diff --git a/docs/tasks.md b/docs/tasks.md index 81116d3138..bcce70245f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -242,6 +242,7 @@ The following tables give you an overview of the tasks in MTEB. | [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) (Sigurbergsson et al., 2020) | ['dan'] | Classification | s2s | [Social, Written] | None | None | | [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. Cimpoi, 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | | [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. 
Cimpoi, 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'min_label_text_length': 24, 'average_label_text_length': 27.38, 'max_label_text_length': 32, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | +| [DadoEvalCoarseClassification](https://github.com/dhfbk/DaDoEval) (Menini et al., 2020) | ['ita'] | Classification | s2s | [Written] | None | None | | [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) (Elena Volodina, 2021) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | | [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) (N{\o, 2021) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | | 
[DanishMedicinesAgencyBitextMining](https://sprogteknologi.dk/dataset/bilingual-english-danish-parallel-corpus-from-the-danish-medicines-agency) (Rozis et al., 2019) | ['dan', 'eng'] | BitextMining | s2s | [Medical, Written] | None | None | @@ -251,6 +252,7 @@ The following tables give you an overview of the tasks in MTEB. | [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | None | None | | [DigikalamagClassification](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Classification | p2p | [Web] | None | None | | [DigikalamagClustering](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Clustering | p2p | [Web] | None | None | +| [DisCoTexPairClassification](https://github.com/davidecolla/DisCoTex) (Brunato et al., 2023) | ['ita'] | PairClassification | s2s | [Social, Written] | None | None | | [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -263,6 +265,7 @@ The following tables give you an overview of the tasks in MTEB. | [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. 
Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'min_query_length': 1, 'avg_query_length': 19.69, 'max_query_length': 151, 'unique_query': 29269, 'min_positive_length': 1, 'avg_positive_length': 803.92, 'max_positive_length': 8640, 'unique_positive': 217712, 'min_negative_length': 1, 'avg_negative_length': 808.5, 'max_negative_length': 4441, 'unique_negative': 39551, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'min_query_length': 1, 'avg_query_length': 21.44, 'max_query_length': 151, 'unique_query': 21296, 'min_positive_length': 1, 'avg_positive_length': 868.37, 'max_positive_length': 5545, 'unique_positive': 150734, 'min_negative_length': 1, 'avg_negative_length': 864.45, 'max_negative_length': 3779, 'unique_negative': 23073}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'min_query_length': 3, 'avg_query_length': 20.68, 'max_query_length': 59, 'unique_query': 3703, 'min_positive_length': 1, 'avg_positive_length': 980.96, 'max_positive_length': 8640, 'unique_positive': 32921, 'min_negative_length': 1, 'avg_negative_length': 1023.22, 'max_negative_length': 4441, 'unique_negative': 9285}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'min_query_length': 1, 'avg_query_length': 10.15, 'max_query_length': 60, 'unique_query': 4286, 'min_positive_length': 1, 'avg_positive_length': 358.36, 'max_positive_length': 3488, 'unique_positive': 35165, 'min_negative_length': 1, 'avg_negative_length': 388.08, 'max_negative_length': 3940, 'unique_negative': 7289}}}} | | [EcomRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | 
[EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) (Dadas et al., 2020) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | +| [EmitClassification](https://github.com/oaraque/emit) (Araque et al., 2023) | ['ita'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) (Saravia et al., 2018) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 72056} | {'test': {'number_of_characters': 88615743, 'num_samples': 72056, 'num_queries': 3743, 'num_documents': 68313, 'min_document_length': 24, 'average_document_length': 1294.37, 'max_document_length': 72928, 'unique_documents': 49186, 'num_document_images': 68313, 'min_query_length': 19, 'average_query_length': 51.7, 'max_query_length': 245, 'unique_queries': 2832, 'num_query_images': 3743, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.31, 'max_relevant_docs_per_query': 75, 'unique_relevant_docs': 2683}} | | [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) (Anu Käver, 2021) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -376,8 +379,11 @@ The following tables give you an overview of the tasks in MTEB. 
| [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) (Trotta et al., 2021) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | | [JCrewBlockerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [JDReview](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | +| [JQaRAReranking](https://huggingface.co/datasets/hotchpotch/JQaRA) | ['jpn'] | Reranking | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | | [JSICK](https://github.com/sbintuitions/JMTEB) (Yanaka et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | | [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) (Kurihara et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | +| [JaCWIRReranking](https://huggingface.co/datasets/hotchpotch/JaCWIR) | ['jpn'] | Reranking | s2s | [Web, Written] | None | None | +| [JaCWIRRetrieval](https://huggingface.co/datasets/hotchpotch/JaCWIR) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaGovFaqsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) (鈴木正敏, 2020) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | @@ -438,6 +444,7 @@ The following tables give you an overview of the tasks in MTEB. | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLVisionRetrieval](https://arxiv.org/pdf/2407.01449) (Radek Osmulski, 2025) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | DocumentUnderstanding | t2i | [Encyclopaedic] | None | None | +| [MKQARetrieval](https://github.com/apple/ml-mkqa) (Shayne Longpre, 2020) | ['ara', 'dan', 'deu', 'eng', 'fin', 'fra', 'heb', 'hun', 'ita', 'jpn', 'khm', 'kor', 'msa', 'nld', 'nor', 'pol', 'por', 'rus', 'spa', 'swe', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Written] | None | None | | [MLQARetrieval](https://huggingface.co/datasets/mlqa) (Lewis et al., 2019) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MLQuestions](https://github.com/McGill-NLP/MLQuestions) (Kulshreshtha et al., 2021) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | | [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | None | None | @@ -488,9 +495,14 @@ The following tables give you an overview of the tasks in MTEB. 
| [NFCorpus-NL](https://huggingface.co/datasets/clips/beir-nl-nfcorpus) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NIGHTSI2IRetrieval](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9f09f316a3eaf59d9ced5ffaefe97e0f-Abstract-Conference.html) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 42158} | {'test': {'number_of_characters': 0, 'num_samples': 42158, 'num_queries': 2120, 'num_documents': 40038, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 40038, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 2120, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2120}} | -| [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | -| [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | -| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalAbsArticleRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalAbsArticleRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalAbsIntroRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| 
[NLPJournalAbsIntroRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleAbsRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleAbsRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleIntroRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleIntroRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | | [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | | [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -670,9 +682,9 @@ The following tables give you an overview of the tasks in MTEB. 
| [SIDClustring](https://www.sid.com/) | ['fas'] | Clustering | p2p | [Academic] | None | None | | [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Sap et al., 2019) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | None | None | -| [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | -| [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | -| [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringP2P) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringS2S) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | +| [SNLRetrieval](https://huggingface.co/datasets/mteb/SNLRetrieval) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [SOPI2IRetrieval](https://paperswithcode.com/dataset/stanford-online-products) (Oh Song et al., 2016) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 240106} | {'test': {'number_of_characters': 0, 'num_samples': 240106, 'num_queries': 120053, 'num_documents': 120053, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 
'num_document_images': 120053, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 120053, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 7.0, 'max_relevant_docs_per_query': 12, 'unique_relevant_docs': 120053}} | | [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | None | None | | [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 8000} | {'test': {'num_samples': 8000, 'unique_num_labels': 10, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 800}, '1': {'count': 800}, '2': {'count': 800}, '3': {'count': 800}, '4': {'count': 800}, '5': {'count': 800}, '6': {'count': 800}, '7': {'count': 800}, '8': {'count': 800}, '9': {'count': 800}}}} | @@ -699,6 +711,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 21750} | {'test': {'num_samples': 21750, 'unique_num_labels': 397, 'min_image_width': 125, 'average_image_width': 354.22, 'max_image_width': 696, 'min_image_height': 94, 'average_image_height': 291.17, 'max_image_height': 595, 'min_label_text_length': 17, 'average_label_text_length': 25.9, 'max_label_text_length': 41, 'labels': {'227': {'count': 439}, '213': {'count': 335}, '53': {'count': 23}, '350': {'count': 40}, '73': {'count': 38}, '316': {'count': 63}, '177': {'count': 80}, '25': {'count': 39}, '275': {'count': 31}, '328': {'count': 33}, '263': {'count': 47}, '239': {'count': 26}, '41': {'count': 213}, '319': {'count': 51}, '91': {'count': 16}, '95': {'count': 183}, '396': {'count': 20}, '259': {'count': 36}, '107': {'count': 167}, '381': {'count': 164}, '174': {'count': 167}, '246': {'count': 44}, '67': {'count': 31}, '374': {'count': 28}, '354': {'count': 22}, '72': {'count': 100}, '97': {'count': 32}, '256': {'count': 57}, '247': {'count': 57}, '159': {'count': 49}, '270': {'count': 135}, '133': {'count': 215}, '197': {'count': 40}, '12': {'count': 38}, '2': {'count': 226}, '115': {'count': 75}, '200': {'count': 93}, '47': {'count': 103}, '9': {'count': 37}, '22': {'count': 76}, '255': {'count': 34}, '267': {'count': 22}, '244': {'count': 93}, '85': {'count': 115}, '342': {'count': 87}, '55': {'count': 50}, '7': {'count': 41}, '337': {'count': 99}, '38': {'count': 28}, '269': {'count': 69}, '106': {'count': 15}, '298': {'count': 27}, '361': {'count': 53}, '8': {'count': 108}, '166': {'count': 47}, '280': {'count': 51}, '35': {'count': 61}, '147': {'count': 82}, '214': {'count': 26}, '284': {'count': 28}, '286': {'count': 66}, '113': {'count': 67}, '83': {'count': 38}, '82': {'count': 236}, '365': {'count': 17}, '242': {'count': 116}, '186': {'count': 38}, '87': {'count': 111}, '274': {'count': 
48}, '27': {'count': 95}, '283': {'count': 22}, '4': {'count': 76}, '334': {'count': 139}, '364': {'count': 21}, '48': {'count': 408}, '311': {'count': 41}, '101': {'count': 64}, '131': {'count': 55}, '172': {'count': 31}, '355': {'count': 28}, '308': {'count': 56}, '5': {'count': 47}, '318': {'count': 155}, '86': {'count': 87}, '46': {'count': 230}, '111': {'count': 69}, '88': {'count': 54}, '23': {'count': 47}, '70': {'count': 61}, '217': {'count': 34}, '11': {'count': 76}, '193': {'count': 207}, '0': {'count': 99}, '303': {'count': 23}, '324': {'count': 47}, '377': {'count': 19}, '345': {'count': 39}, '154': {'count': 49}, '393': {'count': 68}, '152': {'count': 58}, '317': {'count': 27}, '384': {'count': 46}, '257': {'count': 38}, '294': {'count': 47}, '145': {'count': 23}, '289': {'count': 33}, '375': {'count': 19}, '57': {'count': 42}, '15': {'count': 62}, '109': {'count': 24}, '139': {'count': 24}, '66': {'count': 26}, '340': {'count': 32}, '150': {'count': 41}, '118': {'count': 105}, '333': {'count': 27}, '126': {'count': 55}, '366': {'count': 116}, '358': {'count': 151}, '251': {'count': 37}, '309': {'count': 35}, '54': {'count': 20}, '327': {'count': 38}, '3': {'count': 60}, '21': {'count': 56}, '17': {'count': 62}, '146': {'count': 84}, '94': {'count': 42}, '243': {'count': 48}, '335': {'count': 85}, '245': {'count': 141}, '279': {'count': 187}, '360': {'count': 25}, '192': {'count': 105}, '49': {'count': 31}, '230': {'count': 81}, '357': {'count': 22}, '64': {'count': 72}, '112': {'count': 26}, '338': {'count': 70}, '216': {'count': 99}, '234': {'count': 183}, '300': {'count': 153}, '188': {'count': 48}, '254': {'count': 41}, '184': {'count': 183}, '373': {'count': 47}, '221': {'count': 86}, '84': {'count': 49}, '81': {'count': 119}, '161': {'count': 97}, '352': {'count': 21}, '105': {'count': 43}, '39': {'count': 59}, '383': {'count': 40}, '341': {'count': 56}, '63': {'count': 158}, '125': {'count': 29}, '302': {'count': 83}, '262': {'count': 40}, 
'392': {'count': 51}, '326': {'count': 173}, '228': {'count': 93}, '339': {'count': 25}, '80': {'count': 73}, '30': {'count': 42}, '264': {'count': 112}, '56': {'count': 94}, '321': {'count': 16}, '395': {'count': 52}, '68': {'count': 45}, '211': {'count': 45}, '44': {'count': 26}, '299': {'count': 21}, '220': {'count': 35}, '61': {'count': 20}, '138': {'count': 55}, '108': {'count': 111}, '10': {'count': 35}, '386': {'count': 28}, '297': {'count': 49}, '210': {'count': 36}, '175': {'count': 77}, '260': {'count': 68}, '391': {'count': 69}, '102': {'count': 77}, '26': {'count': 44}, '232': {'count': 54}, '6': {'count': 158}, '124': {'count': 43}, '14': {'count': 23}, '201': {'count': 39}, '168': {'count': 18}, '202': {'count': 26}, '140': {'count': 31}, '261': {'count': 60}, '104': {'count': 27}, '356': {'count': 22}, '34': {'count': 147}, '225': {'count': 111}, '60': {'count': 84}, '156': {'count': 35}, '237': {'count': 45}, '268': {'count': 87}, '310': {'count': 31}, '249': {'count': 73}, '281': {'count': 46}, '75': {'count': 89}, '77': {'count': 53}, '132': {'count': 45}, '235': {'count': 42}, '336': {'count': 84}, '123': {'count': 27}, '349': {'count': 90}, '180': {'count': 49}, '378': {'count': 17}, '332': {'count': 30}, '185': {'count': 29}, '389': {'count': 60}, '382': {'count': 77}, '198': {'count': 54}, '74': {'count': 48}, '231': {'count': 85}, '76': {'count': 54}, '151': {'count': 64}, '182': {'count': 17}, '209': {'count': 39}, '344': {'count': 37}, '204': {'count': 67}, '329': {'count': 23}, '380': {'count': 91}, '388': {'count': 32}, '116': {'count': 29}, '24': {'count': 103}, '199': {'count': 33}, '369': {'count': 14}, '359': {'count': 77}, '325': {'count': 39}, '323': {'count': 34}, '162': {'count': 35}, '33': {'count': 46}, '129': {'count': 21}, '287': {'count': 30}, '155': {'count': 24}, '170': {'count': 157}, '296': {'count': 40}, '110': {'count': 102}, '304': {'count': 21}, '164': {'count': 37}, '278': {'count': 23}, '71': {'count': 18}, '194': 
{'count': 24}, '136': {'count': 117}, '103': {'count': 134}, '330': {'count': 26}, '347': {'count': 26}, '206': {'count': 50}, '178': {'count': 43}, '362': {'count': 26}, '119': {'count': 111}, '208': {'count': 33}, '165': {'count': 44}, '90': {'count': 36}, '167': {'count': 40}, '187': {'count': 26}, '99': {'count': 50}, '390': {'count': 64}, '205': {'count': 16}, '65': {'count': 30}, '293': {'count': 23}, '223': {'count': 19}, '96': {'count': 31}, '305': {'count': 44}, '100': {'count': 57}, '385': {'count': 18}, '78': {'count': 42}, '59': {'count': 20}, '37': {'count': 59}, '219': {'count': 76}, '212': {'count': 28}, '1': {'count': 26}, '122': {'count': 35}, '92': {'count': 62}, '43': {'count': 39}, '196': {'count': 56}, '19': {'count': 25}, '128': {'count': 35}, '376': {'count': 77}, '313': {'count': 30}, '114': {'count': 54}, '121': {'count': 31}, '169': {'count': 62}, '331': {'count': 55}, '238': {'count': 16}, '179': {'count': 31}, '127': {'count': 31}, '370': {'count': 98}, '149': {'count': 47}, '346': {'count': 41}, '250': {'count': 22}, '276': {'count': 25}, '163': {'count': 43}, '18': {'count': 33}, '282': {'count': 23}, '215': {'count': 33}, '258': {'count': 60}, '240': {'count': 29}, '233': {'count': 14}, '93': {'count': 27}, '69': {'count': 23}, '266': {'count': 26}, '387': {'count': 55}, '141': {'count': 18}, '191': {'count': 26}, '183': {'count': 42}, '271': {'count': 22}, '120': {'count': 32}, '98': {'count': 53}, '29': {'count': 34}, '28': {'count': 21}, '144': {'count': 26}, '351': {'count': 50}, '368': {'count': 20}, '314': {'count': 27}, '45': {'count': 17}, '218': {'count': 50}, '348': {'count': 25}, '157': {'count': 35}, '117': {'count': 24}, '367': {'count': 24}, '13': {'count': 31}, '363': {'count': 22}, '79': {'count': 28}, '312': {'count': 27}, '372': {'count': 29}, '189': {'count': 21}, '50': {'count': 22}, '160': {'count': 35}, '16': {'count': 39}, '222': {'count': 21}, '58': {'count': 37}, '153': {'count': 64}, '62': {'count': 21}, 
'290': {'count': 25}, '292': {'count': 24}, '285': {'count': 25}, '343': {'count': 32}, '301': {'count': 19}, '190': {'count': 46}, '195': {'count': 24}, '135': {'count': 30}, '315': {'count': 25}, '203': {'count': 29}, '307': {'count': 18}, '142': {'count': 25}, '173': {'count': 28}, '236': {'count': 41}, '171': {'count': 23}, '371': {'count': 17}, '130': {'count': 15}, '277': {'count': 39}, '248': {'count': 22}, '181': {'count': 35}, '40': {'count': 20}, '322': {'count': 15}, '273': {'count': 23}, '148': {'count': 23}, '295': {'count': 25}, '32': {'count': 21}, '320': {'count': 25}, '137': {'count': 32}, '253': {'count': 36}, '31': {'count': 19}, '306': {'count': 27}, '51': {'count': 19}, '52': {'count': 29}, '176': {'count': 31}, '241': {'count': 23}, '265': {'count': 32}, '394': {'count': 26}, '158': {'count': 26}, '226': {'count': 28}, '288': {'count': 21}, '353': {'count': 19}, '291': {'count': 21}, '224': {'count': 26}, '36': {'count': 38}, '20': {'count': 22}, '252': {'count': 18}, '134': {'count': 24}, '143': {'count': 21}, '207': {'count': 28}, '89': {'count': 16}, '272': {'count': 23}, '379': {'count': 24}, '229': {'count': 20}, '42': {'count': 23}}}} | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | | [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) (Arora et al., 2020) | ['san'] | Classification | s2s | [Religious, Written] | None | None | +| [SardiStanceClassification](https://github.com/mirkolai/evalita-sardistance) (Cignarella et al., 2020) | ['ita'] | Classification | s2s | [Social] | None | None | | [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None | | [ScandiSentClassification](https://github.com/timpal0l/ScandiSent) (Isbister et 
al., 2021) | ['dan', 'eng', 'fin', 'nob', 'swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SciDocsRR](https://allenai.org/data/scidocs) (Cohan et al., 2020) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | @@ -902,6 +915,7 @@ The following tables give you an overview of the tasks in MTEB. | [Winoground](https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper) (Tristan Thrush, 2022) | ['eng'] | Compositionality | i2t | [Social] | {'test': 400} | {'test': {'num_samples': 400, 'num_images': 800, 'num_texts': 800, 'num_unique_texts': 800, 'min_text_length': 8, 'average_text_length': 45.47, 'max_text_length': 151}} | | [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) (Suriyawongkul et al., 2019) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | | [XFlickr30kCoT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['deu', 'eng', 'ind', 'jpn', 'rus', 'spa', 'tur', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 32000} | {'test': {'number_of_characters': 1149877, 'num_samples': 32000, 'num_queries': 16000, 'num_documents': 16000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 16000, 'min_query_length': 12, 'average_query_length': 71.87, 'max_query_length': 385, 'unique_queries': 15987, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16000, 'hf_subset_descriptive_stats': {'de': {'number_of_characters': 132154, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 
'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 66.08, 'max_query_length': 220, 'unique_queries': 1994, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'en': {'number_of_characters': 153801, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 34, 'average_query_length': 76.9, 'max_query_length': 377, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'es': {'number_of_characters': 160049, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 23, 'average_query_length': 80.02, 'max_query_length': 342, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'id': {'number_of_characters': 167858, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 83.93, 'max_query_length': 211, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ja': {'number_of_characters': 75480, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 
'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 9, 'average_query_length': 37.74, 'max_query_length': 179, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ru': {'number_of_characters': 149947, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 74.97, 'max_query_length': 294, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'tr': {'number_of_characters': 136134, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 19, 'average_query_length': 68.07, 'max_query_length': 199, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'zh': {'number_of_characters': 46454, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 23.23, 'max_query_length': 66, 'unique_queries': 1999, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}}}} | +| [XGlueWPRReranking](https://github.com/microsoft/XGLUE) (Zeman et al., 2019) | ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'zho'] | Reranking | s2p | [Written] | None 
| None | | [XM3600T2IRetrieval](https://aclanthology.org/2022.emnlp-main.45/) (Thapliyal et al., 2022) | ['ara', 'ben', 'ces', 'dan', 'deu', 'ell', 'eng', 'fas', 'fil', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mri', 'nld', 'nor', 'pol', 'por', 'quz', 'ron', 'rus', 'spa', 'swa', 'swe', 'tel', 'tha', 'tur', 'ukr', 'vie', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 390975} | {'test': {'number_of_characters': 17009034, 'num_samples': 390975, 'num_queries': 261375, 'num_documents': 129600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 129600, 'min_query_length': 9, 'average_query_length': 65.08, 'max_query_length': 532, 'unique_queries': 259932, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 129600, 'hf_subset_descriptive_stats': {'ar': {'number_of_characters': 310802, 'num_samples': 10967, 'num_queries': 7367, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 42.19, 'max_query_length': 208, 'unique_queries': 7339, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'bn': {'number_of_characters': 223622, 'num_samples': 7200, 'num_queries': 3600, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 28, 'average_query_length': 62.12, 'max_query_length': 139, 'unique_queries': 3594, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 
3600}, 'cs': {'number_of_characters': 282069, 'num_samples': 10807, 'num_queries': 7207, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 39.14, 'max_query_length': 266, 'unique_queries': 6814, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'da': {'number_of_characters': 351028, 'num_samples': 10864, 'num_queries': 7264, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 48.32, 'max_query_length': 158, 'unique_queries': 7246, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'de': {'number_of_characters': 660790, 'num_samples': 12243, 'num_queries': 8643, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 76.45, 'max_query_length': 334, 'unique_queries': 8643, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'el': {'number_of_characters': 370363, 'num_samples': 10804, 'num_queries': 7204, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 51.41, 'max_query_length': 262, 'unique_queries': 7100, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 3600}, 'en': {'number_of_characters': 356488, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 49.51, 'max_query_length': 148, 'unique_queries': 7129, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'es': {'number_of_characters': 485004, 'num_samples': 12214, 'num_queries': 8614, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 12, 'average_query_length': 56.3, 'max_query_length': 179, 'unique_queries': 8605, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fa': {'number_of_characters': 430055, 'num_samples': 10845, 'num_queries': 7245, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 59.36, 'max_query_length': 289, 'unique_queries': 7242, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fi': {'number_of_characters': 464334, 'num_samples': 10727, 'num_queries': 7127, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 65.15, 'max_query_length': 336, 'unique_queries': 7110, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 
'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fil': {'number_of_characters': 480287, 'num_samples': 10709, 'num_queries': 7109, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 67.56, 'max_query_length': 332, 'unique_queries': 7016, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fr': {'number_of_characters': 595836, 'num_samples': 12162, 'num_queries': 8562, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 69.59, 'max_query_length': 173, 'unique_queries': 8560, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'he': {'number_of_characters': 457775, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 63.58, 'max_query_length': 453, 'unique_queries': 7190, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hi': {'number_of_characters': 509092, 'num_samples': 12103, 'num_queries': 8503, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 59.87, 'max_query_length': 188, 'unique_queries': 8422, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hr': {'number_of_characters': 420595, 'num_samples': 10880, 'num_queries': 7280, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 57.77, 'max_query_length': 271, 'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hu': {'number_of_characters': 436677, 'num_samples': 10816, 'num_queries': 7216, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 60.52, 'max_query_length': 393, 'unique_queries': 7209, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'id': {'number_of_characters': 666387, 'num_samples': 10726, 'num_queries': 7126, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 93.51, 'max_query_length': 286, 'unique_queries': 7125, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'it': {'number_of_characters': 608604, 'num_samples': 12071, 'num_queries': 8471, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 71.85, 'max_query_length': 201, 'unique_queries': 8470, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ja': {'number_of_characters': 186672, 'num_samples': 10785, 'num_queries': 7185, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 25.98, 'max_query_length': 97, 'unique_queries': 7175, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ko': {'number_of_characters': 188812, 'num_samples': 11250, 'num_queries': 7650, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 24.68, 'max_query_length': 113, 'unique_queries': 7644, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'mi': {'number_of_characters': 262800, 'num_samples': 8332, 'num_queries': 4732, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 55.54, 'max_query_length': 304, 'unique_queries': 4707, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'nl': {'number_of_characters': 370231, 'num_samples': 11659, 'num_queries': 8059, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 45.94, 'max_query_length': 173, 'unique_queries': 8004, 
'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'no': {'number_of_characters': 391381, 'num_samples': 10813, 'num_queries': 7213, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 54.26, 'max_query_length': 162, 'unique_queries': 7191, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pl': {'number_of_characters': 411189, 'num_samples': 10741, 'num_queries': 7141, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 57.58, 'max_query_length': 226, 'unique_queries': 7117, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pt': {'number_of_characters': 446873, 'num_samples': 10843, 'num_queries': 7243, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 'average_query_length': 61.7, 'max_query_length': 324, 'unique_queries': 7220, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'quz': {'number_of_characters': 278263, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 38.65, 'max_query_length': 234, 
'unique_queries': 7130, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ro': {'number_of_characters': 629977, 'num_samples': 10723, 'num_queries': 7123, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 88.44, 'max_query_length': 524, 'unique_queries': 7122, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ru': {'number_of_characters': 477558, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 66.33, 'max_query_length': 232, 'unique_queries': 7194, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sv': {'number_of_characters': 339400, 'num_samples': 10873, 'num_queries': 7273, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 46.67, 'max_query_length': 174, 'unique_queries': 7199, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sw': {'number_of_characters': 444085, 'num_samples': 10646, 'num_queries': 7046, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 63.03, 
'max_query_length': 299, 'unique_queries': 7014, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'te': {'number_of_characters': 341340, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 47.41, 'max_query_length': 132, 'unique_queries': 7062, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'th': {'number_of_characters': 344730, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 47.88, 'max_query_length': 147, 'unique_queries': 7170, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'tr': {'number_of_characters': 458639, 'num_samples': 10833, 'num_queries': 7233, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 63.41, 'max_query_length': 453, 'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'uk': {'number_of_characters': 474311, 'num_samples': 10815, 'num_queries': 7215, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 
'average_query_length': 65.74, 'max_query_length': 372, 'unique_queries': 7206, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'vi': {'number_of_characters': 582546, 'num_samples': 10950, 'num_queries': 7350, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 79.26, 'max_query_length': 287, 'unique_queries': 7350, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'zh': {'number_of_characters': 165110, 'num_samples': 10774, 'num_queries': 7174, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 23.02, 'max_query_length': 96, 'unique_queries': 7165, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}}}} | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | | [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 
'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 
'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 
683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 
109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 
'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 
112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | @@ -993,7 +1007,7 @@ The following tables give you an overview of the tasks in MTEB. | apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 37 | +| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 38 | | arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | | are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1180,9 +1194,9 @@ The following tables give you an overview of the tasks in MTEB. 
| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 0 | 2 | 0 | 8 | 10 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31 | +| dan | Danish | Indo-European | 0 | 2 | 0 | 8 | 10 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | | ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 73 | +| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 7 | 3 | 21 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 75 | | dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1210,7 +1224,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 49 | 20 | 161 | 21 | 7 | 15 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 121 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 506 | +| eng | English | Indo-European | 0 | 3 | 49 | 20 | 161 | 21 | 7 | 15 | 22 | 5 | 0 | 3 | 1 | 13 | 10 | 122 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 508 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1228,10 +1242,10 @@ The following tables give you an overview of the tasks in MTEB. 
| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | | fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 71 | +| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 6 | 4 | 18 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 73 | | fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1286,7 +1300,7 @@ The following tables give you an overview of the tasks in MTEB. 
| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | | heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 44 | | hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1303,7 +1317,7 @@ The following tables give you an overview of the tasks in MTEB. 
| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic | 0 | 1 | 0 | 7 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| hun | Hungarian | Uralic | 0 | 1 | 0 | 7 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | | hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1326,7 +1340,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | isl | Icelandic | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | | isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 7 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 36 | +| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 2 | 8 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 42 | | iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1337,7 +1351,7 @@ The following tables give you an overview of the tasks in MTEB. 
| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | +| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 21 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 52 | | jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | @@ -1365,7 +1379,7 @@ The following tables give you an overview of the tasks in MTEB. 
| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | | khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | @@ -1393,7 +1407,7 @@ The following tables give you an overview of the tasks in MTEB. 
| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 40 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 11 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 41 | | kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1538,7 +1552,7 @@ The following tables give you an overview of the tasks in MTEB. 
| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | | msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1599,7 +1613,7 @@ The following tables give you an overview of the tasks in MTEB. 
| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 0 | 1 | 0 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 30 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 55 | +| nld | Dutch | Indo-European | 0 | 1 | 0 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 31 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 56 | | nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | @@ -1608,7 +1622,7 @@ The following tables give you an overview of the tasks in MTEB. 
| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 4 | 8 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | | noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nop | Numanggang | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nor | Norwegian | Indo-European | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | | not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1669,9 +1683,9 @@ The following tables give you an overview of the tasks in MTEB. 
| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 0 | 1 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 19 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 52 | +| pol | Polish | Indo-European | 0 | 1 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 20 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 53 | | pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 0 | 1 | 0 | 6 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 6 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 34 | +| por | Portuguese | Indo-European | 0 | 1 | 0 | 6 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 2 | 7 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 36 | | poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1718,7 +1732,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 17 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 65 | +| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 18 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 66 | | rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | @@ -1757,7 +1771,7 @@ The following tables give you an overview of the tasks in MTEB. 
| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 58 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 16 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 60 | | spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1782,7 +1796,7 @@ The following tables give you an overview of the tasks in MTEB. 
| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | | swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1815,7 +1829,7 @@ The following tables give you an overview of the tasks in MTEB. 
| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | | tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | | tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | @@ -1854,7 +1868,7 @@ The following tables give you an overview of the tasks in MTEB. 
| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 28 | +| tur | Turkish | Turkic | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 5 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 29 | | tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1888,7 +1902,7 @@ The following tables give you an overview of the tasks in MTEB. 
| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | | viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1964,7 +1978,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | +| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 29 | | zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1983,7 +1997,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1494 | 846 | 316 | 7 | 41 | 22 | 5 | 0 | 3 | 28 | 92 | 56 | 601 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 49 | 1494 | 848 | 316 | 7 | 41 | 22 | 5 | 0 | 3 | 29 | 93 | 65 | 631 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | diff --git a/docs/usage/usage.md b/docs/usage/usage.md index c323982feb..6f5cb1005e 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -1,7 +1,7 @@ # Usage This usage documentation follows a structure similar first it introduces a simple example of how to evaluate a model in MTEB. -Then introduces model detailed section of defining model, selecting tasks and running the evaluation. Each section contain subsection pertaining to +Then introduces model detailed section of defining model, selecting tasks and running the evaluation. Each section contains subsections pertaining to these. 
@@ -28,10 +28,10 @@ For instance if we want to run [`"sentence-transformers/all-MiniLM-L6-v2"`](http ```python model_name = "sentence-transformers/all-MiniLM-L6-v2" -# or using SentenceTransformers -model = SentenceTransformers(model_name) # load the model using MTEB model = mteb.get_model(model_name) # will default to SentenceTransformers(model_name) if not implemented in MTEB +# or using SentenceTransformers +model = SentenceTransformers(model_name) # select the desired tasks and evaluate tasks = mteb.get_tasks(tasks=["Banking77Classification"]) @@ -59,7 +59,7 @@ MTEB is not only text evaluating, but also allow you to evaluate image and image > [!NOTE] > Running MTEB on images requires you to install the optional dependencies using `pip install mteb[image]` -To evaluate image embeddings you can follows the same approach for any other task in `mteb`. Simply ensuring that the task contains the modality "image": +To evaluate image embeddings you can follow the same approach for any other task in `mteb`. Simply ensuring that the task contains the modality "image": ```python tasks = mteb.get_tasks(modalities=["image"]) # Only select tasks with image modalities @@ -107,7 +107,7 @@ model = meta.load_model() model = mteb.get_model(model_name) ``` -You can get an overview of on the models available in `mteb` as follows: +You can get an overview of the models available in `mteb` as follows: ```py model_metas = mteb.get_model_metas() @@ -132,7 +132,7 @@ tasks = mteb.get_tasks(tasks=["Banking77Classification"]) results = mteb.evaluate(model, tasks=tasks) ``` -However, we do recommend check in mteb include an implementation of the model before using sentence transformers since some models (e.g. the [multilingual e5 models](https://huggingface.co/collections/intfloat/multilingual-e5-text-embeddings-67b2b8bb9bff40dec9fb3534)) require a prompt and not specifying it may reduce performance. 
+However, we do recommend checking if mteb includes an implementation of the model before using sentence transformers since some models (e.g. the [multilingual e5 models](https://huggingface.co/collections/intfloat/multilingual-e5-text-embeddings-67b2b8bb9bff40dec9fb3534)) require a prompt and not specifying it may reduce performance. > [!NOTE] > If you want to evaluate a cross encoder on a reranking task, see section on [running cross encoders for reranking](#running-cross-encoders-on-reranking) @@ -141,7 +141,7 @@ However, we do recommend check in mteb include an implementation of the model be It is also possible to implement your own custom model in MTEB as long as it adheres to the [encoder interface](https://github.com/embeddings-benchmark/mteb/blob/main/mteb/encoder_interface.py#L21). -This entails implementing an `encode` function taking as inputs a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.). +This entails implementing an `encode` function taking as input a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.). ```python import mteb @@ -181,7 +181,7 @@ If you want to submit your implementation to be included in the leaderboard see ## Selecting Tasks -This section describes how to select benchmarks and task to evaluate, including selecting specific subsets or splits to run. +This section describes how to select benchmarks and tasks to evaluate, including selecting specific subsets or splits to run. ### Selecting a Benchmark @@ -197,7 +197,7 @@ results = mteb.evaluate(model, tasks=benchmark) The benchmark specified not only a list of tasks, but also what splits and language to run on. 
-To get an overview of all available benchmarks simply run: +To get an overview of all available benchmarks, simply run: ```python import mteb @@ -218,7 +218,7 @@ benchmark.citation ### Task selection -`mteb` comes the utility function `mteb.get_task` and `mteb_get_tasks` for fetching and analysing the tasks of interest. +`mteb` comes with the utility functions `mteb.get_task` and `mteb.get_tasks` for fetching and analysing the tasks of interest. This can be done in multiple ways, e.g.: @@ -296,7 +296,7 @@ results = mteb.evaluate(model, tasks=[MyCustomTask()]) ## Running the Evaluation -This section contain documentation related to the runtime of the evalution. How to pass arguments to the encoder, saving outputs and similar. +This section contains documentation related to the runtime of the evaluation. How to pass arguments to the encoder, saving outputs and similar. ### Introduction to `mteb.evaluate()` @@ -307,7 +307,6 @@ Evalauting models in `mteb` typically takes the simple form: results = mteb.evaluate(model, tasks=tasks) ``` - ### Specifying the cache By default `mteb` with save the results in cache folder located at `~/.cache/mteb`, however if you want to saving the results in a specific folder you @@ -360,7 +359,7 @@ In prompts the key can be: 8. `STS` 9. `Summarization` 10. `InstructionRetrieval` -3. Pair of task type and prompt type like `Retrival-query` - these prompts will be used in all classification tasks +3. Pair of task type and prompt type like `Retrieval-query` - these prompts will be used in all Retrieval tasks 4. Task name - these prompts will be used in the specific task 5. 
Pair of task name and prompt type like `NFCorpus-query` - these prompts will be used in the specific task diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 3a8f0834df..c600e3cf4d 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -OLD_FORMAT_RERANKING_TASKS = [] +OLD_FORMAT_RERANKING_TASKS = ["JQaRAReranking", "JaCWIRReranking", "XGlueWPRReranking"] class AbsTaskReranking(AbsTaskRetrieval): diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index f4c468c035..92472515e6 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -195,6 +195,7 @@ class MetadataDatasetDict(TypedDict, total=False): name: str split: str trust_remote_code: bool + dataset_version: str # NLPJournalAbsArticleRetrieval.V2 class TaskMetadata(BaseModel): diff --git a/mteb/descriptive_stats/Retrieval/NLPJournalAbsIntroRetrieval.V2.json b/mteb/descriptive_stats/Retrieval/NLPJournalAbsIntroRetrieval.V2.json new file mode 100644 index 0000000000..c7c6eef43f --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NLPJournalAbsIntroRetrieval.V2.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 1147, + "number_of_characters": 1607635, + "num_documents": 637, + "min_document_length": 304, + "average_document_length": 2148.0376766091053, + "max_document_length": 9565, + "unique_documents": 637, + "num_queries": 510, + "min_query_length": 18, + "average_query_length": 469.2843137254902, + "max_query_length": 1290, + "unique_queries": 510, + "none_queries": 0, + "num_relevant_docs": 510, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 510, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + 
"min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/NLPJournalTitleAbsRetrieval.V2.json b/mteb/descriptive_stats/Retrieval/NLPJournalTitleAbsRetrieval.V2.json new file mode 100644 index 0000000000..2fde056730 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NLPJournalTitleAbsRetrieval.V2.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 1147, + "number_of_characters": 308305, + "num_documents": 637, + "min_document_length": 18, + "average_document_length": 461.51962323390893, + "max_document_length": 1290, + "unique_documents": 637, + "num_queries": 510, + "min_query_length": 5, + "average_query_length": 28.072549019607845, + "max_query_length": 71, + "unique_queries": 510, + "none_queries": 0, + "num_relevant_docs": 510, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 510, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/NLPJournalTitleIntroRetrieval.V2.json b/mteb/descriptive_stats/Retrieval/NLPJournalTitleIntroRetrieval.V2.json new file mode 100644 index 0000000000..ad85bd9a93 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NLPJournalTitleIntroRetrieval.V2.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 1147, + "number_of_characters": 1382617, + "num_documents": 637, + "min_document_length": 304, + "average_document_length": 2148.0376766091053, + "max_document_length": 9565, + "unique_documents": 637, + "num_queries": 510, + "min_query_length": 5, + "average_query_length": 28.072549019607845, + "max_query_length": 71, + "unique_queries": 510, + 
"none_queries": 0, + "num_relevant_docs": 510, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 510, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index aca21a64c9..bd48f9c202 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -13,7 +13,6 @@ import cachetools import gradio as gr import pandas as pd -from gradio_rangeslider import RangeSlider import mteb from mteb.abstasks.task_metadata import TaskDomain, TaskType @@ -155,10 +154,10 @@ def filter_models( availability: bool | None, compatibility: list[str], instructions: bool | None, - model_size: tuple[int | None, int | None], + max_model_size: int, zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"], ): - lower, upper = model_size + lower, upper = 0, max_model_size # Setting to None, when the user doesn't specify anything if (lower == MIN_MODEL_SIZE) or (lower is None): lower = None @@ -176,6 +175,7 @@ def filter_models( frameworks=compatibility, n_parameters_range=(lower, upper), ) + models_to_keep = set() for model_meta in model_metas: is_model_zero_shot = model_meta.is_zero_shot_on(task_select) @@ -214,7 +214,7 @@ def get_leaderboard_app() -> gr.Blocks: availability=None, compatibility=[], instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + max_model_size=MAX_MODEL_SIZE, zero_shot_setting="allow_all", ) @@ -376,11 +376,19 @@ def get_leaderboard_app() -> gr.Blocks: label="Zero-shot", interactive=True, ) - model_size = RangeSlider( - minimum=MIN_MODEL_SIZE, - maximum=MAX_MODEL_SIZE, - value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - label="Model Size (#M 
Parameters)", + + max_model_size = gr.Radio( + [ + ("<100M", 100), + ("<500M", 500), + ("<1B", 1000), + ("<5B", 5000), + ("<10B", 10000), + (">10B", MAX_MODEL_SIZE), + ], + value=MAX_MODEL_SIZE, + label="Model Parameters", + interactive=True, ) with gr.Tab("Summary"): @@ -578,7 +586,7 @@ def update_task_list( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot: hash( ( id(scores), @@ -586,7 +594,7 @@ def update_task_list( hash(availability), hash(tuple(compatibility)), hash(instructions), - hash(model_size), + hash(max_model_size), hash(zero_shot), ) ), @@ -597,7 +605,7 @@ def update_models( availability: bool | None, compatibility: list[str], instructions: bool | None, - model_size: tuple[int, int], + max_model_size: int, zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"], ): start_time = time.time() @@ -608,7 +616,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot_setting=zero_shot, ) elapsed = time.time() - start_time @@ -626,7 +634,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], @@ -639,7 +647,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], @@ -652,7 +660,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], @@ -665,7 +673,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], @@ -678,12 +686,12 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], ) - model_size.change( + max_model_size.change( update_models, inputs=[ scores, @@ -691,7 +699,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], 
outputs=[models], @@ -704,7 +712,7 @@ def update_models( availability, compatibility, instructions, - model_size, + max_model_size, zero_shot, ], outputs=[models], @@ -782,7 +790,7 @@ def update_tables( availability=None, compatibility=[], instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + max_model_size=MAX_MODEL_SIZE, zero_shot="allow_all", ) # We have to call this both on the filtered and unfiltered task because the callbacks diff --git a/mteb/models/abs_encoder.py b/mteb/models/abs_encoder.py index 27a9b52faf..63fd3d0afa 100644 --- a/mteb/models/abs_encoder.py +++ b/mteb/models/abs_encoder.py @@ -31,6 +31,7 @@ class AbsEncoder(ABC): mteb_model_meta: ModelMeta | None = None model_prompts: dict[str, str] | None = None instruction_template: str | Callable[[str, PromptType], str] | None = None + prompts_dict: dict[str, str] | None = None def similarity(self, embeddings1: Array, embeddings2: Array) -> Array: if self.mteb_model_meta is None or ( @@ -242,18 +243,26 @@ def validate_task_to_prompt_name(self) -> None: raise KeyError(msg) def get_instruction( - self, task_metadata: TaskMetadata, prompt_type: PromptType | None + self, + task_metadata: TaskMetadata, + prompt_type: PromptType | None, ) -> str: """Get the instruction/prompt to be used for encoding sentences.""" - if isinstance(task_metadata.prompt, dict) and prompt_type: - if task_metadata.prompt.get(prompt_type.value): - return task_metadata.prompt[prompt_type.value] + prompt = task_metadata.prompt + if self.prompts_dict and task_metadata.name in self.prompts_dict: + prompt = self.prompts_dict[task_metadata.name] + + if isinstance(prompt, dict) and prompt_type: + if prompt.get(prompt_type.value): + return prompt[prompt_type.value] logger.warning( f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'." 
) return "" - if task_metadata.prompt: - return task_metadata.prompt + + if prompt: + return prompt + abstask = mteb.get_task(task_name=task_metadata.name) return abstask.abstask_prompt @@ -273,10 +282,12 @@ def format_instruction( return self.instruction_template(instruction, prompt_type) def get_task_instruction( - self, task_metadata: TaskMetadata, prompt_type: PromptType | None + self, + task_metadata: TaskMetadata, + prompt_type: PromptType | None, ) -> str: instruction = self.get_instruction(task_metadata, prompt_type) - if self.instruction_template: + if self.instruction_template and len(instruction) > 0: return self.format_instruction(instruction) return instruction diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 98523d0f8b..9334d7cd33 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -94,6 +94,7 @@ def __init__( apply_instruction_to_passages: bool = True, padding_side: str | None = None, add_eos_token: bool = False, + prompts_dict: dict[str, str] | None = None, **kwargs: Any, ): """Instruct Sentence Transformer Wrapper. Wrapper that passes instructions to the Sentence Transformer model. @@ -107,6 +108,7 @@ def __init__( apply_instruction_to_passages: Whether to apply the instruction template to the passages. padding_side: Padding side. If None, the padding side will be read from the model config. add_eos_token: Whether to add the eos token to each input example. + prompts_dict: Dictionary mapping task names to the prompt to use for that task, overriding the prompt from the task metadata. If None, the prompt from the task metadata is used. **kwargs: Kwargs for Sentence Transformer model. 
""" if ( @@ -126,6 +128,7 @@ def __init__( self.model = SentenceTransformer(model_name, revision=revision, **kwargs) self.apply_instruction_to_passages = apply_instruction_to_passages self.add_eos_token = add_eos_token + self.prompts_dict = prompts_dict if max_seq_length is not None: self.model.max_seq_length = max_seq_length if padding_side is not None: diff --git a/mteb/models/model_implementations/chain19_models.py b/mteb/models/model_implementations/chain19_models.py index c71444804d..487b06b488 100644 --- a/mteb/models/model_implementations/chain19_models.py +++ b/mteb/models/model_implementations/chain19_models.py @@ -1,18 +1,12 @@ from __future__ import annotations -from functools import partial - from mteb.models.model_meta import ModelMeta from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader # NOTE: This model is intentionally not imported to ensure that it does not appear on the leaderboard # see more here: https://github.com/embeddings-benchmark/mteb/issues/2698 chain19_en = ModelMeta( - loader=partial( - sentence_transformers_loader, - model_name="bchoiced/CHAIN19", - revision="5ba01fcb4e90ede5e2772b8a9ca68c12515dc6af", - ), + loader=sentence_transformers_loader, name="bchoiced/CHAIN19", languages=[ "eng-Latn", diff --git a/mteb/models/model_implementations/conan_models.py b/mteb/models/model_implementations/conan_models.py index bbc7148f3b..3a75931e21 100644 --- a/mteb/models/model_implementations/conan_models.py +++ b/mteb/models/model_implementations/conan_models.py @@ -7,14 +7,16 @@ import random import string import time -from functools import partial from typing import Any import numpy as np import requests +from torch.utils.data import DataLoader +from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder from mteb.models.model_meta import ModelMeta +from mteb.types import Array, BatchedInput, PromptType from .bge_models import bge_full_data from .e5_instruct import 
E5_MISTRAL_TRAINING_DATA @@ -153,6 +155,8 @@ class ConanWrapper(AbsEncoder): def __init__( self, model_name: str, + revision: str | None = None, + api_model_name: str | None = None, **kwargs, ) -> None: AK = os.getenv("CONAN_AK") @@ -161,14 +165,20 @@ def __init__( raise ValueError("CONAN_AK and CONAN_SK environment variables must be set") self.client = Client(ak=AK, sk=SK, url="https://ai.om.qq.com/api/conan/v2") - self.model_name = model_name + self.model_name = api_model_name def encode( self, - sentences: list[str], + inputs: DataLoader[BatchedInput], + *, + task_metadata: TaskMetadata, + hf_split: str, + hf_subset: str, + prompt_type: PromptType | None = None, **kwargs: Any, - ) -> np.ndarray: + ) -> Array: embeddings = [] + sentences = [text for batch in inputs for text in batch["text"]] for sentence in sentences: try: @@ -191,9 +201,9 @@ def encode( "eng-Latn", "zho-Hans", ], - loader=partial( # type: ignore - ConanWrapper, - model_name="Conan-embedding-v2", + loader=ConanWrapper, + loader_kwargs=dict( + api_model_name="Conan-embedding-v2", ), max_tokens=32768, embed_dim=3584, diff --git a/mteb/models/model_implementations/fa_models.py b/mteb/models/model_implementations/fa_models.py index 846f266f49..e2ee880f7d 100644 --- a/mteb/models/model_implementations/fa_models.py +++ b/mteb/models/model_implementations/fa_models.py @@ -174,3 +174,45 @@ # https://huggingface.co/datasets/sbunlp/hmblogs-v3 }, ) + +tooka_sbert_v2_small = ModelMeta( + loader=sentence_transformers_loader, + name="PartAI/Tooka-SBERT-V2-Small", + languages=["fas-Arab"], + open_weights=True, + revision="8bbed87e36669387f71437c061430ba56d1b496f", + release_date="2025-05-01", + n_parameters=122_905_344, + memory_usage_mb=496, + embed_dim=768, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/PartAI/Tooka-SBERT-V2-Small", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + 
public_training_data=None, + training_datasets=None, +) + +tooka_sbert_v2_large = ModelMeta( + loader=sentence_transformers_loader, + name="PartAI/Tooka-SBERT-V2-Large", + languages=["fas-Arab"], + open_weights=True, + revision="b59682efa961122cc0e4408296d5852870c82eae", + release_date="2025-05-01", + n_parameters=353_039_360, + memory_usage_mb=1347, + embed_dim=1024, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/PartAI/Tooka-SBERT-V2-Large", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) diff --git a/mteb/models/model_implementations/geogpt_models.py b/mteb/models/model_implementations/geogpt_models.py index c199aec89e..bd5c9d9b06 100644 --- a/mteb/models/model_implementations/geogpt_models.py +++ b/mteb/models/model_implementations/geogpt_models.py @@ -2,11 +2,9 @@ from __future__ import annotations -from functools import partial - import torch -from mteb.models.instruct_wrapper import instruct_wrapper +from mteb.models.instruct_wrapper import InstructSentenceTransformerModel from mteb.models.model_meta import ModelMeta geoembedding = ModelMeta( @@ -14,10 +12,8 @@ languages=["eng-Latn"], open_weights=True, revision="29803c28ea7ef6871194a8ebc85ad7bfe174928e", - loader=partial( - instruct_wrapper, - "GeoGPT-Research-Project/GeoEmbedding", - "29803c28ea7ef6871194a8ebc85ad7bfe174928e", + loader=InstructSentenceTransformerModel, + loader_kwargs=dict( instruction_template="Instruct: {instruction}\nQuery: ", apply_instruction_to_passages=False, model_kwargs={"torch_dtype": torch.bfloat16}, diff --git a/mteb/models/model_implementations/hinvec_models.py b/mteb/models/model_implementations/hinvec_models.py new file mode 100644 index 0000000000..daac069ca1 --- /dev/null +++ b/mteb/models/model_implementations/hinvec_models.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import 
logging + +from mteb.models.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader +from mteb.types import PromptType + +logger = logging.getLogger(__name__) + + +def instruction_template( + instruction: str, prompt_type: PromptType | None = None +) -> str: + return f"Instruct: {instruction}\nQuery: " if instruction else "" + + +hinvec_training_datasets = { + "MintakaRetrieval": ["train"], + "HindiDiscourseClassification": ["train"], + "SentimentAnalysisHindi": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPIntentClassification": ["train"], + "LinceMTBitextMining": ["train"], + "PhincBitextMining": ["train"], + "XNLI": ["train"], + "MLQARetrieval": ["validation"], + "FloresBitextMining": ["dev"], + "AmazonReviewsClassification": ["train"], +} + +Hinvec_bidir = ModelMeta( + loader=sentence_transformers_loader, + loader_kwargs=dict( + instruction_template=instruction_template, + trust_remote_code=True, + max_seq_length=2048, + padding_side="left", + add_eos_token=True, + ), + name="Sailesh97/Hinvec", + languages=["eng-Latn", "hin-Deva"], + open_weights=True, + revision="d4fc678720cc1b8c5d18599ce2d9a4d6090c8b6b", + release_date="2025-06-19", + n_parameters=939_591_680, + memory_usage_mb=3715, + embed_dim=2048, + license="cc-by-nc-4.0", + max_tokens=2048, + reference="https://huggingface.co/Sailesh97/Hinvec", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + training_datasets=hinvec_training_datasets, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/model_implementations/jina_models.py b/mteb/models/model_implementations/jina_models.py index 1b7d121c00..46c9cc6944 100644 --- a/mteb/models/model_implementations/jina_models.py +++ b/mteb/models/model_implementations/jina_models.py @@ -1,13 +1,16 @@ from __future__ import annotations import logging +from collections.abc import Sequence from typing import 
Any +import numpy as np import torch from sentence_transformers import __version__ as st_version from torch.utils.data import DataLoader from mteb.abstasks.task_metadata import TaskMetadata +from mteb.languages import PROGRAMMING_LANGS from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper from mteb.requires_package import requires_package @@ -120,6 +123,33 @@ "zho-Hans", ] +JinaV4_TRAINING_DATA = { + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "mMARCO-NL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "NQ-NL": ["train"], # translation not trained on + "STS12": ["train"], + "SICK-R": ["train"], + "CodeSearchNetRetrieval": ["train"], + "CodeFeedbackST": ["train"], + "CodeFeedbackMT": ["train"], + "AppsRetrieval": ["train"], + "StackOverflowQA": ["train"], + "CornStack": [], + "VDRMultilingualRetrieval": ["train"], + # from https://huggingface.co/datasets/vidore/colpali_train_set + "DocVQA": ["train"], + "InfoVQA": ["train"], + "TATDQA": ["train"], + "arXivQA": ["train"], + # "other": [], # inhouse dataset including synthetic datasets +} + class JinaWrapper(SentenceTransformerWrapper): """following the hf model card documentation.""" @@ -188,6 +218,137 @@ def encode( return embeddings +class JinaV4Wrapper(SentenceTransformerWrapper): + """following the hf model card documentation.""" + + jina_task_to_prompt = { + "retrieval.query": "Query: ", + "retrieval.passage": "Passage: ", + "text-matching": "Query: ", + } + + def __init__( + self, + model: str, + revision: str | None = None, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + requires_package( + self, "flash_attn", model, "pip install 'mteb[flash_attention]'" + ) + requires_package(self, "peft", model, "pip install 
'mteb[jina-v4]'") + requires_package(self, "torchvision", model, "pip install 'mteb[jina-v4]'") + import flash_attn # noqa: F401 + import peft # noqa: F401 + import transformers # noqa: F401 + + super().__init__(model, revision, model_prompts, **kwargs) + + def encode( + self, + sentences: Sequence[str], + *, + task_name: TaskMetadata, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + prompt_name = self.get_prompt_name(task_name, prompt_type) + if prompt_name: + logger.info( + f"Using prompt_name={prompt_name} for task={task_name} prompt_type={prompt_type}" + ) + else: + logger.info( + f"No model prompts found for task={task_name} prompt_type={prompt_type}" + ) + logger.info(f"Encoding {len(sentences)} sentences.") + + # Get Jina-specific parameters + jina_task_name = self.model_prompts.get(prompt_name) if prompt_name else None + jina_prompt = ( + self.jina_task_to_prompt.get(jina_task_name) if jina_task_name else None + ) + + # Override task for programming-related content + jina_task_name = get_programming_task_override(task_name, jina_task_name) + + embeddings = self.model.encode( + sentences, + task=jina_task_name.split(".")[0] if jina_task_name else None, + prompt=jina_prompt, + **kwargs, + ) + + if isinstance(embeddings, torch.Tensor): + # sometimes in kwargs can be return_tensors=True + embeddings = embeddings.cpu().detach().float().numpy() + return embeddings + + +def get_programming_task_override( + task_name: str, current_task_name: str | None +) -> str | None: + """Check if task involves programming content and override with 'code' task if so. 
+ + Args: + task_name: Original task name to check + current_task_name: Current Jina task name + + Returns: + 'code' if programming-related task detected, otherwise current_task_name + """ + # Import here to avoid circular imports + from mteb import get_task + + task = get_task(task_name) + + # Check various indicators for programming content + has_code_language = any(lang.endswith("-Code") for lang in task.metadata.eval_langs) + has_programming_language = any( + lang in PROGRAMMING_LANGS for lang in task.metadata.languages + ) + has_programming_domain = any( + domain == "Programming" for domain in task.metadata.domains + ) + + if has_code_language or has_programming_language or has_programming_domain: + return "code" + + return current_task_name + + +jina_embeddings_v4 = ModelMeta( + loader=JinaV4Wrapper, + loader_kwargs=dict( + trust_remote_code=True, + model_prompts={ + "Retrieval-query": "retrieval.query", + "Retrieval-passage": "retrieval.passage", + "STS": "text-matching", + }, + ), + name="jinaai/jina-embeddings-v4", + languages=XLMR_LANGUAGES, + open_weights=True, + revision="26239889730c735ed7e9a4db9180c8935faf4ba0", + release_date="2025-06-24", # official release date + n_parameters=int(3.8 * 1e9), + memory_usage_mb=7500, + max_tokens=8194, + embed_dim=2048, + license="cc-by-nc-4.0", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + reference="https://huggingface.co/jinaai/jina-embeddings-v4", + public_training_code=None, + public_training_data=None, + training_datasets=JinaV4_TRAINING_DATA, + adapted_from="Qwen/Qwen2.5-VL-3B-Instruct", +) + + jina_embeddings_v3 = ModelMeta( loader=JinaWrapper, # type: ignore loader_kwargs=dict( diff --git a/mteb/models/model_implementations/kalm_models.py b/mteb/models/model_implementations/kalm_models.py new file mode 100644 index 0000000000..975c3389f6 --- /dev/null +++ b/mteb/models/model_implementations/kalm_models.py @@ -0,0 +1,540 @@ +from __future__ import 
annotations + +import logging +from typing import Any + +import torch +from torch.utils.data import DataLoader + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.models.instruct_wrapper import InstructSentenceTransformerModel +from mteb.models.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader +from mteb.types import Array, BatchedInput, PromptType + +logger = logging.getLogger(__name__) + + +class KALMWrapper(InstructSentenceTransformerModel): + def encode( + self, + inputs: DataLoader[BatchedInput], + *, + task_metadata: TaskMetadata, + hf_split: str, + hf_subset: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> Array: + _inputs = [text for batch in inputs for text in batch["text"]] + + instruction = self.get_task_instruction(task_metadata, prompt_type) + + # to passage prompts won't be applied to passages + if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + instruction = None + logger.info( + f"No instruction used, because prompt type = {prompt_type.passage}" + ) + + if task_metadata.type in ["STS", "PairClassification", "Summarization"]: + logger.info( + f"No instruction used, because task type = {task_metadata.type}" + ) + instruction = None + + if instruction: + logger.info( + f"Using instruction: '{instruction}' for task: '{task_metadata.name}'" + ) + + embeddings = self.model.encode( + _inputs, + prompt=instruction, + **kwargs, + ) + + if isinstance(embeddings, torch.Tensor): + # sometimes in kwargs can be return_tensors=True + embeddings = embeddings.cpu().detach().float().numpy() + return embeddings + + +kalm_training_data = { + # from technical report + # not in MTEB: + # ExpertQA + # MEDI2BGE + # OpenOrca + # PAQ + # PubMedQA + # SearchQA + # arxiv_qa + # rag-dataset-12000 + # CC-News + # SQuAD 2.0 + # TriviaQA + # WebGPT Comparisons + # MultiNLI + # NLLB + # WikiAnswers + # SimCSE NLI + # SNLI + # Aya Dataset + # eli5 + # 
---- + # in MTEB: + "CodeFeedbackMT": ["train"], + "CodeFeedbackST": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "TRECCOVID": ["train"], + "DBPedia": ["train"], + "ESCIReranking": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FEVER-NL": ["train"], # translation not trained on + "FiQA2018-NL": ["train"], # translation not trained on + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQA-NL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MultiLongDocRetrieval": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "MSMARCOv2": ["train"], + "NFCorpus": ["train"], + "SciFact": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "NQ-NL": ["train"], # translation not trained on + "YahooAnswersTopicsClassification": ["train"], + "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], + "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], + "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ + "train" + ], + "ContractNLILimitedUseLegalBenchClassification": ["train"], + "ContractNLINoLicensingLegalBenchClassification": ["train"], + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissibleCopyLegalBenchClassification": ["train"], + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": 
["train"], + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], + "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], + "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], + "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], + "QuoraRetrieval": ["train"], + "NanoQuoraRetrieval": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "Banking77Classification": ["train"], + "AmazonPolarityClassification": ["train"], + "ImdbClassification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "PawsXPairClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "MultilingualSentiment": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], +} + +KaLM_task_prompts = { + "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", + "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classifying the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given an online banking query, find the corresponding intents", + "EmotionClassification": "Classifying the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classifying the sentiment expressed in the given movie review text from the IMDB 
dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classifying the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classifying the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classifying the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classifying the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Categorizing the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "MultilingualSentiment": "Classifying sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classifying sentiment of the customer review for iPhone into positive or negative", + "OnlineShopping": "Classifying sentiment of the customer review into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "MasakhaNEWSClassification": "Classifying the category of french news.", + "CBD": "Classifying the sentiment of polish tweet reviews", + "PolEmo2.0-IN": "Classifying the sentiment of in-domain (medicine and hotels) online reviews", + "PolEmo2.0-OUT": "Classifying the sentiment of out-of-domain (products and school) online reviews", + "AllegroReviews": "Classifying the sentiment of reviews from e-commerce marketplace Allegro", + "PAC": 'Classifying the sentence into one of the two types: "BEZPIECZNE_POSTANOWIENIE_UMOWNE" and "KLAUZULA_ABUZYWNA"', + "GeoreviewClassification": "Classifying the sentiment of Russian reviews.", + "HeadlineClassification": "Classifying the topic of Russian headlines.", + "InappropriatenessClassification": "Detecting inappropriate messages on sensitive topics", + 
"KinopoiskClassification": "Classifying the sentiment of Kinopoisk reviews.", + "RuReviewsClassification": "Classifying the sentiment of Russian product reviews.", + "RuSciBenchGRNTIClassification": "Classifying the topic of Russian scientific papers.", + "RuSciBenchOECDClassification": "Classifying the topic of Russian scientific papers.", + "CEDRClassification": "Classification of sentences by emotions.", + "SensitiveTopicsClassification": "Detecting inappropriate messages on sensitive topics.", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles and posts", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or 
theme of the given news articles based on the titles and contents", + "AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions", + "AlloProfClusteringS2S": "Identify the main category of Allo Prof document based on the titles", + "HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents", + "MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents", + "MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles", + "EightTagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology", + "GeoreviewClusteringP2P": "Identify the topic or theme of the Russian reviews.", + "RuSciBenchGRNTIClusteringP2P": "Identify the topic or theme of the Russian articles.", + "RuSciBenchOECDClusteringP2P": "Identify the topic or theme of the Russian articles.", +} + + +KaLM_X_task_prompts = { + "Classification": "classify the query into different classes.", + "MultilabelClassification": "Instruct: classify the query into different classes.", + "Clustering": "classify the query into different classes.", + "Reranking-query": "Given a query, retrieve documents that answer the query.", + "Retrieval-query": "Given a query, retrieve documents that answer the query.", + "InstructionRetrieval-query": "Given a query, retrieve documents that answer the query.", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", + "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", + 
"AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given a online banking query, find the corresponding intents", + "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Classify the fine-grained category of the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative", + "OnlineShopping": "Classify the customer review for online shopping into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and 
abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "AskUbuntuDupQuestions-query": "Retrieve duplicate questions from AskUbuntu forum", + "MindSmallReranking-query": "Retrieve relevant news articles based on user browsing history", + "SciDocsRR-query": "Given a title of a scientific paper, retrieve the titles of other relevant papers", + "StackOverflowDupQuestions-query": "Retrieve duplicate questions from StackOverflow forum", + "T2Reranking-query": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoReranking-query": "Given a Chinese search query, retrieve web passages that answer the question", + "CMedQAv1-reranking-query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "CMedQAv2-reranking-query": "Given a Chinese community 
medical question, retrieve replies that best answer the question", + "ArguAna-query": "Given a claim, find documents that refute the claim", + "ArguAna-passage": "Given a claim, find documents that refute the claim", + "ClimateFEVER-query": "Given a claim about climate change, retrieve documents that support or refute the claim", + "ClimateFEVERHardNegatives-query": "Given a claim about climate change, retrieve documents that support or refute the claim", + "DBPedia-query": "Given a query, retrieve relevant entity descriptions from DBPedia", + "FEVER-query": "Given a claim, retrieve documents that support or refute the claim", + "FEVERHardNegatives-query": "Given a claim, retrieve documents that support or refute the claim", + "FiQA2018-query": "Given a financial question, retrieve user replies that best answer the question", + "HotpotQA-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "HotpotQAHardNegatives-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "MSMARCO-query": "Given a web search query, retrieve relevant passages that answer the query", + "NFCorpus-query": "Given a question, retrieve relevant documents that best answer the question", + "NQ-query": "Given a question, retrieve Wikipedia passages that answer the question", + "QuoraRetrieval-query": "Given a question, retrieve questions that are semantically equivalent to the given question", + "SCIDOCS-query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", + "SciFact-query": "Given a scientific claim, retrieve documents that support or refute the claim", + "Touche2020-query": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "Touche2020Retrieval.v3-query": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "TRECCOVID-query": "Given a query on COVID-19, retrieve documents that answer the 
query", + "T2Retrieval-query": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoRetrieval-query": "Given a web search query, retrieve relevant passages that answer the query", + "DuRetrieval-query": "Given a Chinese search query, retrieve web passages that answer the question", + "CovidRetrieval-query": "Given a question on COVID-19, retrieve news articles that answer the question", + "CmedqaRetrieval-query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "EcomRetrieval-query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products", + "MedicalRetrieval-query": "Given a medical question, retrieve user replies that best answer the question", + "VideoRetrieval-query": "Given a video search query, retrieve the titles of relevant videos", + "MasakhaNEWSClassification": "Classify the News in the given texts into one of the seven category: politics,sports,health,business,entertainment,technology,religion ", + "AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions", + "AlloProfClusteringS2S": "Identify the topic of document titles from Allo Prof dataset", + "HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents", + "MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents", + "MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles", + "SyntecReranking-query": "Given a question, retrieve passages that answer the question", + "AlloprofReranking-query": "Given a question, retrieve passages that answer the question", + 
"AlloprofRetrieval-query": "Given a question, retrieve passages that answer the question", + "BSARDRetrieval-query": "Given a question, retrieve passages that answer the question", + "SyntecRetrieval-query": "Given a question, retrieve passages that answer the question", + "XPQARetrieval-query": "Given a question, retrieve passages that answer the question", + "MintakaRetrieval-query": "Given a question, retrieve passages that answer the question", + "CBD": "Classify the sentiment of polish tweet reviews", + "PolEmo2.0-IN": "Classify the sentiment of in-domain (medicine and hotels) online reviews", + "PolEmo2.0-OUT": "Classify the sentiment of out-of-domain (products and school) online reviews", + "AllegroReviews": "Classify the sentiment of reviews from e-commerce marketplace Allegro", + "PAC": 'Classify the sentence into one of the two types: "BEZPIECZNE_POSTANOWIENIE_UMOWNE" and "KLAUZULA_ABUZYWNA"', + "EightTagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology", + "ArguAna-PL-query": "Given a claim, find documents that refute the claim", + "DBPedia-PL-query": "Given a query, retrieve relevant entity descriptions from DBPedia", + "FiQA-PL-query": "Given a financial question, retrieve user replies that best answer the question", + "HotpotQA-PL-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "MSMARCO-PL-query": "Given a web search query, retrieve relevant passages that answer the query", + "NFCorpus-PL-query": "Given a question, retrieve relevant documents that best answer the question", + "NQ-PL-query": "Given a question, retrieve Wikipedia passages that answer the question", + "Quora-PL-query": "Given a question, retrieve questions that are semantically equivalent to the given question", + "SCIDOCS-PL-query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", + 
"SciFact-PL-query": "Given a scientific claim, retrieve documents that support or refute the claim", + "TRECCOVID-PL-query": "Given a query on COVID-19, retrieve documents that answer the query", + "GeoreviewClassification": "Classify the organization rating based on the reviews", + "HeadlineClassification": "Classify the topic or theme of the given news headline", + "InappropriatenessClassification": "Classify the given message as either sensitive topic or not", + "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text", + "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment", + "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts", + "GeoreviewClusteringP2P": "Identify the organization category based on the reviews", + "RuSciBenchGRNTIClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", + "RuBQReranking-query": "Given a question, retrieve Wikipedia passages that answer the question", + "RiaNewsRetrieval-query": "Given a headline, retrieval relevant articles", + "RuBQRetrieval-query": "Given a question, retrieve Wikipedia passages that answer the question", + "AppsRetrieval-query": "Given a question about code problem, retrieval code that can solve user's problem", + "COIRCodeSearchNetRetrieval-query": "Given a code snippet, retrieve the comment corresponding to that code.", + "CodeEditSearchRetrieval-query": "Given a piece of code, retrieval code that in the ", + "CodeFeedbackMT-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "CodeFeedbackST-query": "Given a question about coding, retrieval code or passage 
that can solve user's question", + "CodeSearchNetCCRetrieval-query": "Given a code comment, retrieve the code snippet corresponding to that comment.", + "CodeSearchNetRetrieval-query": "Given a code snippet, retrieve the comment corresponding to that code.", + "CodeTransOceanContest-query": "Given a piece for code, retrieval semantically similar code", + "CodeTransOceanDL-query": "Given a piece for code, retrieval semantically similar code", + "CosQA-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "StackOverflowQA-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "SyntheticText2SQL-query": "Given a user's question, retrieve SQL queries that are appropriate responses to the question", + "BulgarianStoreReviewSentimentClassfication": "Classify user reviews into positive or negative sentiment", + "CzechProductReviewSentimentClassification": "Classify product reviews into positive or negative sentiment", + "GreekLegalCodeClassification": "Given a greek legal text, classify its topic", + "DBpediaClassification": "Given a Wikipedia articles, categorized it into classes based on its DBpedia ontology", + "FinancialPhrasebankClassification": "Given financial news, categorized by sentiment into positive, negative, or neutral", + "PoemSentimentClassification": "Gvien a poem, categorized by sentiment into positive, no_impact, negative or mixed", + "TweetTopicSingleClassification": "Gvien a twitter, classify its topic", + "EstonianValenceClassification": "Given a news article, categorized by sentiment into negatiivne, positiivne, neutraalne or vastuolulin", + "FilipinoShopeeReviewsClassification": "Given a shop review, classify its rating on a scale from 1 to 5", + "GujaratiNewsClassification": "Given a Gujarati news articles, classify ist topic", + "SentimentAnalysisHindi": "Given a hindi text, categorized by sentiment into positive, negative or neutral", + 
"IndonesianIdClickbaitClassification": "Given an Indonesian news headlines, classify its into clickbait or non-clickbait", + "ItaCaseholdClassification": "Given a judgments, classify its topic", + "KorSarcasmClassification": "Given a twitter, categorized it into sarcasm or not_sarcasm", + "KurdishSentimentClassification": "Given a text, categorized by sentiment into positive or negative", + "MacedonianTweetSentimentClassification": "Given a Macedonian tweet, categorized by sentiment into positive, negative, or neutral", + "AfriSentiClassification": "Given a text, categorized by sentiment into positive, negative, or neutral", + "CataloniaTweetClassification": "Given a tweet, categorized by sentiment into AGAINST, FAVOR or NEUTRAL", + "CyrillicTurkicLangClassification": "Given a text, classify its language", + "IndicLangClassification": "Given a text, classify its language", + "MultiHateClassification": "Given a text, categorized by sentiment into hate or non-hate", + "NusaParagraphEmotionClassification": "Given a paragraph, classify its emotion", + "NusaX-senti": "Given a text, categorized by sentiment into positive or negative", + "SwissJudgementClassification": "Given a news article, categorized it into approval or dismissal", + "NepaliNewsClassification": "Given a news article, categorized it into business, entertainment or sports", + "OdiaNewsClassification": "Given a news article, categorized it into business, entertainment or sports", + "PunjabiNewsClassification": "Given a news article, categorized it into two-classes", + "SinhalaNewsClassification": "Given a news article, categorized it into political, business, technology, sports and Entertainment", + "CSFDSKMovieReviewSentimentClassification": "Given a movie review, classify its rating on a scale from 0 to 5", + "SiswatiNewsClassification": "Given a news article, classify its topic", + "SlovakMovieReviewSentimentClassification": "Given a movie review, categorized it into positive or negative", + 
"SwahiliNewsClassification": "Given a news article, classify its domain", + "TswanaNewsClassification": "Given a news article, classify its topic", + "IsiZuluNewsClassification": "Given a news article, classify its topic", + "WikiCitiesClustering": "Identify of Wikipedia articles of cities by country", + "RomaniBibleClustering": "Identify verses from the Bible in Kalderash Romani by book.", + "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BigPatentClustering.v2": "Identify the category of documents from the Big Patent dataset", + "AlloProfClusteringS2S.v2": "Identify the topic of document titles from Allo Prof dataset", + "HALClusteringS2S.v2": "Identify the topic of titles from HAL", + "SIB200ClusteringS2S": "Identify the category of documents", + "WikiClusteringP2P.v2": "Identify the category of wiki passages", + "PlscClusteringP2P.v2": "Identify the category of titles+abstracts from Library of Science", + "KorHateSpeechMLClassification": "Given a Korean online news comments, classify its fine-grained hate speech classes", + "MalteseNewsClassification": "Given a maltese new, classify its topic", + "MultiEURLEXMultilabelClassification": "Given a text, classify its topic", + "BrazilianToxicTweetsClassification": "Given a tweet, classify its topic", + "AILAStatutes-query": "Identifying the most relevant statutes for a given situation", + "HagridRetrieval-query": "Retrieval the relevant passage for the given query", + "LegalBenchCorporateLobbying-query": "Retrieval the relevant passage for the given query", + "LEMBPasskeyRetrieval-query": "Retrieval the relevant passage for the given query", + "BelebeleRetrieval-query": "Retrieval the relevant passage for the given query", + "MLQARetrieval-query": "Retrieval the relevant passage for the given query", + 
"StatcanDialogueDatasetRetrieval-query": "Retrieval the relevant passage for the given query", + "WikipediaRetrievalMultilingual-query": "Retrieval the relevant passage for the given query", + "Core17InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "News21InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "Robust04InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "WebLINXCandidatesReranking-query": "Retrieval the relevant passage for the given query", + "WikipediaRerankingMultilingual-query": "Retrieval the relevant passage for the given query", + "MIRACLRetrievalHardNegatives-query": "Retrieval relevant passage for the given query", + "CQADupstackRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", +} + +KaLM_INSTRUCTION = "Instruct: {instruction} \n Query: " + +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( + loader=KALMWrapper, + loader_kwargs=dict( + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + revision="45e42c89990c40aca042659133fc8b13c28634b5", + 
release_date="2024-10-23", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_training_data, # Replace with actual dataset if available + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) + +HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( + loader=sentence_transformers_loader, + name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", + revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", + release_date="2024-08-27", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=kalm_training_data, + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) + +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1_5 = ModelMeta( + loader=KALMWrapper, + loader_kwargs=dict( + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", + release_date="2024-12-26", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + 
reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_training_data, + adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + superseded_by=None, +) + + +# KaLM_Embedding_X_0605 = ModelMeta( +# loader=partial( +# KALMWrapper, +# model_name="KaLM-Team/KaLM-Embedding-X-0605", +# revision="1", +# instruction_template=KaLM_INSTRUCTION, +# max_seq_length=512, +# apply_instruction_to_passages=True, +# prompts_dict=KaLM_X_task_prompts, +# ), +# name="KaLM-Team/KaLM-Embedding-X-0605", +# revision="1", +# languages=None, +# open_weights=False, +# release_date="2025-06-05", +# n_parameters=9.24 * 1e9, +# memory_usage_mb=35254, +# max_tokens=8192, +# embed_dim=3584, +# license=None, +# reference="https://github.com/KaLM-Team/KaLM-Embedding-X", +# similarity_fn_name="cosine", +# framework=["Sentence Transformers", "PyTorch"], +# use_instructions=True, +# public_training_code="https://github.com/HITsz-TMG/KaLM-Embedding", +# public_training_data=None, +# training_datasets=kalm_training_data, +# ) diff --git a/mteb/models/model_implementations/lgai_embedding_models.py b/mteb/models/model_implementations/lgai_embedding_models.py index 016bde7912..81e9457420 100644 --- a/mteb/models/model_implementations/lgai_embedding_models.py +++ b/mteb/models/model_implementations/lgai_embedding_models.py @@ -1,7 +1,5 @@ from __future__ import annotations -from functools import partial - from mteb.models.model_meta import ModelMeta from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader @@ -46,11 +44,7 @@ } lgai_embedding_en = ModelMeta( - loader=partial( - sentence_transformers_loader, - model_name="annamodels/LGAI-Embedding-Preview", - revision="5e0b2316acc8c2e2941ded6b9cb200b1cb313e65", - ), + loader=sentence_transformers_loader, name="annamodels/LGAI-Embedding-Preview", languages=[ "eng-Latn", diff --git 
a/mteb/models/model_implementations/mcinext_models.py b/mteb/models/model_implementations/mcinext_models.py new file mode 100644 index 0000000000..f93289af6c --- /dev/null +++ b/mteb/models/model_implementations/mcinext_models.py @@ -0,0 +1,496 @@ +from __future__ import annotations + +import logging +import os +import time +from typing import Any + +import numpy as np +import requests + +from mteb.models.abs_encoder import AbsEncoder +from mteb.models.model_meta import ModelMeta +from mteb.types import PromptType + +logger = logging.getLogger(__name__) + +MODEL_API_NAMES = { + "hakim": "Hakim", + "hakim-small": "Hakim_small", + "hakim-unsup": "Hakim_unsuper", +} + +# Dataset task mappings with descriptions and task IDs +DATASET_TASKS = { + "PersianTextEmotion": ("دسته بندی , دسته بندی احساس متن", 1), + "PersianFoodSentimentClassification": ("دسته بندی , تحلیل احساس رضایت متن", 1), + "SentimentDKSF": ("دسته بندی , تحلیل احساس رضایت متن", 1), + "MassiveIntentClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "MassiveScenarioClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "SynPerChatbotConvSAAnger": ( + "دسته بندی , تحلیل احساس عصبانیت کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSASatisfaction": ( + "دسته بندی , تحلیل احساس رضایت کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSAFriendship": ( + "دسته بندی , تحلیل احساس صمیمیت کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSAFear": ( + "دسته بندی , تحلیل احساس ترس کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSAJealousy": ( + "دسته بندی , تحلیل احساس حسادت کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSASurprise": ( + "دسته بندی , تحلیل احساس شگفتی کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSALove": ( + "دسته بندی , تحلیل احساس عشق کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSASadness": ( + "دسته بندی , تحلیل احساس غصه کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSAHappiness": ( + "دسته بندی , 
تحلیل احساس خوشحالی کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotConvSAToneChatbotClassification": ( + "دسته بندی , تشخیص لحن چت بات در مکالمه ی کاربر با چت بات", + 1, + ), + "SynPerChatbotConvSAToneUserClassification": ( + "دسته بندی , تشخیص لحن کاربر در مکالمه با چت بات", + 1, + ), + "PersianTextTone": ("دسته بندی , تشخیص لحن متن", 1), + "SynPerChatbotToneUserClassification": ( + "دسته بندی , تشخیص لحن کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotToneChatbotClassification": ( + "دسته بندی , تشخیص لحن چت بات در مکالمه ی کاربر با چت بات", + 1, + ), + "SynPerChatbotRAGToneUserClassification": ( + "دسته بندی , تشخیص لحن کاربر در مکالمه با چت بات", + 1, + ), + "SynPerChatbotRAGToneChatbotClassification": ( + "دسته بندی , تشخیص لحن چت بات در مکالمه ی کاربر با چت بات", + 1, + ), + "SynPerChatbotSatisfactionLevelClassification": ( + "دسته بندی , تحلیل احساس رضایت کاربر در مکالمه با چت بات", + 1, + ), + "DigimagClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "NLPTwitterAnalysisClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "SIDClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "DeepSentiPers": ("دسته بندی , تحلیل احساس رضایت متن", 1), + "DigikalamagClassification": ("دسته بندی , دسته بندی موضوعی متن", 1), + "FarsTail": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 4), + "ParsinluEntail": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 4), + "ParsinluQueryParaphPC": ( + "تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", + 4, + ), + "SynPerChatbotRAGFAQPC": ( + "تشخیص ارتباط , متن اول مکالمه ی کاربر با چت بات است. 
آیا متن دوم خلاصه ی متن اول است ؟", + 4, + ), + "SynPerTextKeywordsPC": ("تشخیص ارتباط , آیا متن دوم پاسخ متن اول است ؟", 4), + "SynPerQAPC": ("تشخیص ارتباط , آیا متن دوم به متن اول مرتبط است ؟", 4), + "CExaPPC": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 4), + "FarsiParaphraseDetection": ( + "تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", + 4, + ), + "Farsick": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 3), + "Query2Query": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 3), + "SynPerSTS": ("تشخیص ارتباط , آیا متن دوم شباهت معنایی با متن اول دارد ؟", 3), + "BeytooteClustering": ("دسته بندی , دسته بندی موضوعی متن", 1), + "DigikalamagClustering": ("دسته بندی , دسته بندی موضوعی متن", 1), + "NLPTwitterAnalysisClustering": ("دسته بندی , دسته بندی موضوعی متن", 1), + "HamshahriClustring": ("دسته بندی , دسته بندی موضوعی متن", 1), + "SIDClustring": ("دسته بندی , دسته بندی موضوعی متن", 1), + "MIRACLReranking": ("تشخیص ارتباط , آیا متن دوم پاسخ متن اول است ؟", 3), + "WikipediaRerankingMultilingual": ( + "تشخیص ارتباط , آیا متن دوم پاسخ متن اول است ؟", + 3, + ), + "SAMSumFa": ( + "تشخیص ارتباط , متن اول یک مکالمه است. آیا متن دوم خلاصه ی متن اول است ؟", + 3, + ), + "SynPerChatbotSumSRetrieval": ( + "تشخیص ارتباط , متن اول مکالمه ی کاربر با چت بات است. آیا متن دوم خلاصه ی متن اول است ؟", + 3, + ), + "SynPerChatbotRAGSumSRetrieval": ( + "تشخیص ارتباط , متن اول مکالمه ی کاربر با چت بات است. آیا متن دوم خلاصه ی متن اول است ؟", + 3, + ), + "SynPerQARetrieval": ("تشخیص ارتباط , آیا متن دوم پاسخ متن اول است ؟", 3), + "SynPerChatbotTopicsRetrieval": ( + "تشخیص ارتباط , متن اول مکالمه ی کاربر با چت بات است. آیا متن دوم موضوع استخراج شده ی متن اول است ؟", + 3, + ), + "SynPerChatbotRAGTopicsRetrieval": ( + "تشخیص ارتباط , متن اول مکالمه ی کاربر با چت بات است. 
class APIError(Exception):
    """Error raised when a call to the embedding API cannot be completed.

    The exception message always starts with ``API Error:`` followed by
    ``message``. When ``status_code`` is truthy it is appended as
    ``(Status Code: <code>)``. The raw ``status_code`` argument is kept on
    the instance either way.
    """

    def __init__(self, message: str, status_code: int | None = None):
        rendered = f"API Error: {message}"
        if status_code:
            # Only decorate the message when a (non-zero) code is known.
            rendered = f"{rendered} (Status Code: {status_code})"
        super().__init__(rendered)
        self.status_code = status_code
raise ValueError("MCINEXT_API_KEY environment variable not set.") + self.headers = { + "Content-Type": "application/json", + "Accept": "application/json", + "Authorization": f"Bearer {self.api_key}", + } + logger.info(f"Initialized model wrapper for: {api_model_name}") + + def _preprocess_sample( + self, + sample: str, + task_name: str, + prompt_type: PromptType | None, + sub: str | None, + ) -> str: + """Preprocesses a single text sample based on the task.""" + if "unsup" in self.model_name: + return sample + + task_prompt, task_id = DATASET_TASKS.get(task_name, (None, None)) + + if not task_prompt: + logger.warning(f"Unknown dataset: {task_name}, no preprocessing applied.") + return sample + + task_prompt = f"مسئله : {task_prompt}" + + if task_id == 1: + return f"{task_prompt} | متن : {sample}" + if task_id == 3: + if sub == "sentence1" or (prompt_type and prompt_type.value == "query"): + return f"{task_prompt} | متن اول : {sample}" + if sub == "sentence2" or (prompt_type and prompt_type.value == "passage"): + return f"{task_prompt} | متن دوم : {sample}" + return sample + + def _make_api_request(self, data: dict[str, Any]) -> list[list[float]]: + """Makes an API request with retry logic.""" + for attempt in range(self.max_retries): + try: + response = requests.post( + self.api_url, headers=self.headers, json=data, timeout=60 + ) + response.raise_for_status() + response_data = response.json() + + if not response_data.get("data") or not all( + "embedding" in item for item in response_data["data"] + ): + raise APIError("Invalid response format from API.") + + return [item["embedding"] for item in response_data["data"]] + + except requests.exceptions.RequestException as e: + status_code = e.response.status_code if e.response else None + logger.warning( + f"API request failed (attempt {attempt + 1}/{self.max_retries}): {e}" + ) + if status_code and 400 <= status_code < 500 and status_code != 429: + raise APIError(f"Client error: {e}", status_code) + 
time.sleep(self.retry_delay * (2**attempt)) + + raise APIError(f"API request failed after {self.max_retries} attempts.") + + def encode( + self, + sentences: list[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + batch_size: int = 32, + **kwargs: Any, + ) -> np.ndarray: + """Encodes sentences using the API.""" + if not sentences or not all(isinstance(s, str) for s in sentences): + raise ValueError("Input must be a non-empty list of strings.") + + logger.info( + f"Starting encoding for {len(sentences)} sentences, task: {task_name}, batch_size: {batch_size}" + ) + + sub = kwargs.get("sub") + processed_sentences = [ + self._preprocess_sample(s, task_name, prompt_type, sub) for s in sentences + ] + + all_embeddings = [] + for i in range(0, len(processed_sentences), batch_size): + batch = processed_sentences[i : i + batch_size] + data = { + "model": MODEL_API_NAMES[self.model_name], + "input": batch, + "encoding_format": "float", + "add_special_tokens": True, + } + try: + batch_embeddings = self._make_api_request(data) + if len(batch_embeddings) != len(batch): + raise APIError( + f"Embedding count mismatch: expected {len(batch)}, got {len(batch_embeddings)}" + ) + all_embeddings.extend(batch_embeddings) + except APIError as e: + logger.error(f"Failed to process batch starting at index {i}: {e}") + raise e + + logger.info( + f"Encoding completed successfully for {len(all_embeddings)} sentences." 
+ ) + return np.array(all_embeddings, dtype=np.float32) + + +hakim = ModelMeta( + loader=HakimModelWrapper, + loader_kwargs=dict( + api_model_name="hakim", + ), + name="MCINext/Hakim", + languages=["fas-Arab"], + open_weights=False, + revision="1", + release_date="2025-05-10", + n_parameters=124_441_344, + memory_usage_mb=475, + embed_dim=768, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/MCINext/Hakim", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "FarsTail": [], + "SAMSumFa": ["train"], + "SynPerChatbotSumSRetrieval": ["train"], + "SynPerChatbotRAGSumSRetrieval": ["train"], + "SynPerChatbotConvSAClassification": ["train"], + "SynPerChatbotConvSAToneChatbotClassification": ["train"], + "SynPerChatbotConvSAToneUserClassification": ["train"], + "SynPerChatbotSatisfactionLevelClassification": ["train"], + "SynPerChatbotRAGToneChatbotClassification": ["train"], + "SynPerChatbotRAGToneUserClassification": ["train"], + "SynPerChatbotToneChatbotClassification": ["train"], + "SynPerChatbotToneUserClassification": ["train"], + "SynPerTextToneClassification": ["train"], + "SIDClassification": ["train"], + "PersianTextEmotion": ["train"], + "SentimentDKSF": ["train"], + "NLPTwitterAnalysisClassification": ["train"], + "DigikalamagClassification": ["train"], + "DigikalamagClustering": ["train"], + "NLPTwitterAnalysisClustering": ["train"], + "SIDClustring": ["train"], + "CExaPPC": ["train"], + "SynPerChatbotRAGFAQPC": ["train"], + "FarsiParaphraseDetection": ["train"], + "SynPerTextKeywordsPC": ["train"], + "SynPerQAPC": ["train"], + "ParsinluEntail": ["train"], + "ParsinluQueryParaphPC": ["train"], + "FiQA2018-Fa": ["train"], + "HotpotQA-Fa": ["train"], + "MSMARCO-Fa": ["train"], + "NFCorpus-Fa": ["train"], + "SciFact-Fa": ["train"], + "SynPerQARetrieval": ["train"], + "SynPerChatbotTopicsRetrieval": ["train"], + 
"SynPerChatbotRAGTopicsRetrieval": ["train"], + "SynPerChatbotRAGFAQRetrieval": ["train"], + "Farsick": ["train"], + "SynPerSTS": ["train"], + "Query2Query": ["train"], + }, +) + + +hakim_small = ModelMeta( + loader=HakimModelWrapper, + loader_kwargs=dict( + api_model_name="hakim-small", + ), + name="MCINext/Hakim-small", + languages=["fas-Arab"], + open_weights=False, + revision="1", + release_date="2025-05-10", + n_parameters=38_736_384, + memory_usage_mb=148, + embed_dim=512, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/MCINext/Hakim-small", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "FarsTail": [], + "SAMSumFa": ["train"], + "SynPerChatbotSumSRetrieval": ["train"], + "SynPerChatbotRAGSumSRetrieval": ["train"], + "SynPerChatbotConvSAClassification": ["train"], + "SynPerChatbotConvSAToneChatbotClassification": ["train"], + "SynPerChatbotConvSAToneUserClassification": ["train"], + "SynPerChatbotSatisfactionLevelClassification": ["train"], + "SynPerChatbotRAGToneChatbotClassification": ["train"], + "SynPerChatbotRAGToneUserClassification": ["train"], + "SynPerChatbotToneChatbotClassification": ["train"], + "SynPerChatbotToneUserClassification": ["train"], + "SynPerTextToneClassification": ["train"], + "SIDClassification": ["train"], + "PersianTextEmotion": ["train"], + "SentimentDKSF": ["train"], + "NLPTwitterAnalysisClassification": ["train"], + "DigikalamagClassification": ["train"], + "DigikalamagClustering": ["train"], + "NLPTwitterAnalysisClustering": ["train"], + "SIDClustring": ["train"], + "CExaPPC": ["train"], + "SynPerChatbotRAGFAQPC": ["train"], + "FarsiParaphraseDetection": ["train"], + "SynPerTextKeywordsPC": ["train"], + "SynPerQAPC": ["train"], + "ParsinluEntail": ["train"], + "ParsinluQueryParaphPC": ["train"], + "FiQA2018-Fa": ["train"], + "HotpotQA-Fa": ["train"], + "MSMARCO-Fa": ["train"], + 
"NFCorpus-Fa": ["train"], + "SciFact-Fa": ["train"], + "SynPerQARetrieval": ["train"], + "SynPerChatbotTopicsRetrieval": ["train"], + "SynPerChatbotRAGTopicsRetrieval": ["train"], + "SynPerChatbotRAGFAQRetrieval": ["train"], + "Farsick": ["train"], + "SynPerSTS": ["train"], + "Query2Query": ["train"], + }, +) + +hakim_unsup = ModelMeta( + loader=HakimModelWrapper, + loader_kwargs=dict( + api_model_name="hakim-unsup", + ), + name="MCINext/Hakim-unsup", + languages=["fas-Arab"], + open_weights=False, + revision="1", + release_date="2025-05-10", + n_parameters=124_441_344, + memory_usage_mb=475, + embed_dim=768, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/MCINext/Hakim-unsup", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "FarsTail": [], + "Farsick": ["train"], + "MSMARCO-Fa": ["train"], + "Query2Query": ["train"], + }, +) diff --git a/mteb/models/model_implementations/misc_models.py b/mteb/models/model_implementations/misc_models.py index ddd508efd6..1c65ccc201 100644 --- a/mteb/models/model_implementations/misc_models.py +++ b/mteb/models/model_implementations/misc_models.py @@ -125,152 +125,6 @@ superseded_by=None, ) -kalm_training_data = { - # from technical report - # not in MTEB: - # ExpertQA - # MEDI2BGE - # OpenOrca - # PAQ - # PubMedQA - # SearchQA - # arxiv_qa - # rag-dataset-12000 - # CC-News - # SQuAD 2.0 - # TriviaQA - # WebGPT Comparisons - # MultiNLI - # NLLB - # WikiAnswers - # SimCSE NLI - # SNLI - # Aya Dataset - # eli5 - # ---- - # in MTEB: - "CodeFeedbackMT": ["train"], - "CodeFeedbackST": ["train"], - "ArxivClusteringP2P": ["train"], - "ArxivClusteringS2S": ["train"], - "ArxivClusteringP2P.v2": ["train"], - "TRECCOVID": ["train"], - "DBPedia": ["train"], - "ESCIReranking": ["train"], - "FEVER": ["train"], - "FiQA2018": ["train"], - "FEVERHardNegatives": ["train"], - "NanoFEVERRetrieval": ["train"], 
- "FEVER-NL": ["train"], # translation not trained on - "FiQA2018-NL": ["train"], # translation not trained on - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQA-NL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - "MultiLongDocRetrieval": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "mMARCO-NL": ["train"], # translation not trained on - "MSMARCOv2": ["train"], - "NFCorpus": ["train"], - "SciFact": ["train"], - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - "NQ-NL": ["train"], # translation not trained on - "YahooAnswersTopicsClassification": ["train"], - "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], - "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], - "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ - "train" - ], - "ContractNLILimitedUseLegalBenchClassification": ["train"], - "ContractNLINoLicensingLegalBenchClassification": ["train"], - "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], - "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ - "train" - ], - "ContractNLIPermissibleCopyLegalBenchClassification": ["train"], - "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ - "train" - ], - "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"], - "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], - "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], - "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], - "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], - "QuoraRetrieval": ["train"], - "NanoQuoraRetrieval": ["train"], - 
"BiorxivClusteringP2P.v2": ["train"], - "BiorxivClusteringS2S.v2": ["train"], - "MedrxivClusteringP2P.v2": ["train"], - "MedrxivClusteringS2S.v2": ["train"], - "Banking77Classification": ["train"], - "AmazonPolarityClassification": ["train"], - "ImdbClassification": ["train"], - "EmotionClassification": ["train"], - "TweetSentimentExtractionClassification": ["train"], - "ToxicConversationsClassification": ["train"], - "MIRACLRetrieval": ["train"], - "MIRACLRetrievalHardNegatives": ["train"], - "MIRACLReranking": ["train"], - "MrTidyRetrieval": ["train"], - "PawsXPairClassification": ["train"], - "AmazonReviewsClassification": ["train"], - "AmazonCounterfactualClassification": ["train"], - "MultilingualSentiment": ["train"], - "MassiveIntentClassification": ["train"], - "MassiveScenarioClassification": ["train"], - "MTOPDomainClassification": ["train"], - "MTOPIntentClassification": ["train"], -} - -HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( - name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", - revision="45e42c89990c40aca042659133fc8b13c28634b5", - release_date="2024-10-23", - languages=None, - loader=sentence_transformers_loader, - n_parameters=494032768, - memory_usage_mb=1885, - max_tokens=512, - embed_dim=896, - license="mit", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch"], - reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", - similarity_fn_name=ScoringFunction.COSINE, - use_instructions=None, - training_datasets=kalm_training_data, - adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_1022_filtered_v2_inst_3node_g8_1e-5_sin-0.1_mrl", - superseded_by=None, -) -HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( - name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", - revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", - release_date="2024-08-27", - 
languages=None, - loader=sentence_transformers_loader, - n_parameters=494032768, - memory_usage_mb=1885, - max_tokens=512, - embed_dim=896, - license="mit", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch"], - reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", - similarity_fn_name=ScoringFunction.COSINE, - use_instructions=None, - training_datasets=kalm_training_data, - adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_0902_filtered_v2_3node_g8_1e-5_sin-0.1", - superseded_by=None, -) Hum_Works__lodestone_base_4096_v1 = ModelMeta( name="Hum-Works/lodestone-base-4096-v1", revision="9bbc2d0b57dd2198aea029404b0f976712a7d966", diff --git a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py new file mode 100644 index 0000000000..550595df4e --- /dev/null +++ b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py @@ -0,0 +1,177 @@ +from __future__ import annotations + +from typing import Any + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from transformers import AutoModel + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.models.abs_encoder import AbsEncoder +from mteb.models.model_meta import ModelMeta +from mteb.types import Array, BatchedInput, PromptType + + +class llama_nemoretriever_colembed(AbsEncoder): + def __init__( + self, + model_name_or_path: str, + revision: str, + trust_remote_code: bool, + device_map="cuda", + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + **kwargs, + ): + self.model = AutoModel.from_pretrained( + model_name_or_path, + revision=revision, + device_map=device_map, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + attn_implementation=attn_implementation, + ).eval() + + def 
get_text_embeddings(self, texts, batch_size: int = 32, **kwargs): + batch_size = 1 + return self.model.forward_queries(texts, batch_size=batch_size) + + def get_image_embeddings( + self, + images, + batch_size: int = 32, + **kwargs, + ): + import torchvision.transforms.functional as F + + all_images = [] + if isinstance(images, DataLoader): + iterator = images + else: + iterator = DataLoader(images, batch_size=batch_size) + + for batch in iterator: + for b in batch: + pil_img = ( + F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b + ) + all_images.append(pil_img) + + batch_size = 1 + return self.model.forward_passages(all_images, batch_size=batch_size) + + def calculate_probs(self, text_embeddings, image_embeddings): + scores = self.similarity(text_embeddings, image_embeddings) + return (scores * 100).softmax(dim=-1) + + def similarity(self, a, b): + return self.model.get_scores(a, b) + + def get_fused_embeddings( + self, + *args, + **kwargs, + ): + raise NotImplementedError( + "Fused embeddings are not supported yet. Please use get_text_embeddings or get_image_embeddings." + ) + + def encode( + self, + inputs: DataLoader[BatchedInput], + *, + task_metadata: TaskMetadata, + hf_split: str, + hf_subset: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> Array: + text_embeddings = None + image_embeddings = None + + if "text" in inputs.dataset.features: + text_embeddings = self.get_text_embeddings(inputs, **kwargs) + if "image" in inputs.dataset.features: + image_embeddings = self.get_image_embeddings(inputs, **kwargs) + + if text_embeddings is not None and image_embeddings is not None: + raise NotImplementedError( + "Fused embeddings are not supported yet. Please use get_text_embeddings or get_image_embeddings." 
+ ) + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + raise ValueError + + +TRAINING_DATA = { + # from https://huggingface.co/datasets/vidore/colpali_train_set + "DocVQA": ["train"], + "InfoVQA": ["train"], + "TATDQA": ["train"], + "arXivQA": ["train"], + "hotpotqa": ["train"], + "miracl": ["train"], + "NQ": ["train"], + "stackexchange": ["train"], + "SQuAD": ["train"], + "WebInstructSub": ["train"], + "docmatix-ir": ["train"], + "vdr-multilingual-train": ["train"], + "colpali_train_set": ["train"], # as it contains PDFs + "VisRAG-Ret-Train-Synthetic-data": ["train"], + "VisRAG-Ret-Train-In-domain-data": ["train"], + "wiki-ss-nq": ["train"], +} + +llama_nemoretriever_colembed_1b_v1 = ModelMeta( + loader=llama_nemoretriever_colembed, + loader_kwargs=dict( + trust_remote_code=True, + ), + name="nvidia/llama-nemoretriever-colembed-1b-v1", + languages=["eng-Latn"], + revision="1f0fdea7f5b19532a750be109b19072d719b8177", + release_date="2025-06-27", + modalities=["image", "text"], + n_parameters=2_418_000_000, + memory_usage_mb=9224, + max_tokens=8192, + embed_dim=2048, + license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE", + open_weights=True, + public_training_code="Proprietary Code", + public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset", + framework=["PyTorch"], + reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1", + similarity_fn_name="MaxSim", + use_instructions=True, + training_datasets=TRAINING_DATA, +) + +llama_nemoretriever_colembed_3b_v1 = ModelMeta( + loader=llama_nemoretriever_colembed, + loader_kwargs=dict( + trust_remote_code=True, + ), + name="nvidia/llama-nemoretriever-colembed-3b-v1", + languages=["eng-Latn"], + revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c", + release_date="2025-06-27", + modalities=["image", "text"], + n_parameters=4_407_000_000, + 
memory_usage_mb=16811, + max_tokens=8192, + embed_dim=3072, + license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE", + open_weights=True, + public_training_code="Proprietary Code", + public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset", + framework=["PyTorch"], + reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1", + similarity_fn_name="MaxSim", + use_instructions=True, + training_datasets=TRAINING_DATA, +) diff --git a/mteb/models/model_implementations/openai_models.py b/mteb/models/model_implementations/openai_models.py index 5c44ba469b..6fa3a66674 100644 --- a/mteb/models/model_implementations/openai_models.py +++ b/mteb/models/model_implementations/openai_models.py @@ -53,8 +53,6 @@ def __init__( import tiktoken self._client = OpenAI() if client is None else client - self._model_name = model_name.split("/")[-1] - self._embed_dim = embed_dim if embed_dim is None: if model_name not in self.default_embed_dims: diff --git a/mteb/models/model_implementations/ru_sentence_models.py b/mteb/models/model_implementations/ru_sentence_models.py index 896597525e..837a7e3807 100644 --- a/mteb/models/model_implementations/ru_sentence_models.py +++ b/mteb/models/model_implementations/ru_sentence_models.py @@ -4,6 +4,7 @@ import torch +from mteb.models.instruct_wrapper import InstructSentenceTransformerModel from mteb.models.model_meta import ( ModelMeta, ScoringFunction, @@ -16,6 +17,47 @@ nomic_training_data, ) +GIGA_task_prompts = { + "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise\nquery: ", + "STS22": "Retrieve semantically similar text\nquery: ", + "RuSTSBenchmarkSTS": "Retrieve semantically similar text\nquery: ", + "RUParaPhraserSTS": "Retrieve semantically similar text\nquery: ", + "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально) \nкомментарий: 
", + "GeoreviewClassification": "Classify the organization rating based on the reviews\nquery: ", + "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва \nотзыв: ", + "HeadlineClassification": "Классифицируй тему данного новостного заголовка \nзаголовок: ", + "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный \nкомментарий: ", + "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text\nquery: ", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents\nquery: ", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios\nquery: ", + "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment\nquery: ", + "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", + "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", + "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", + "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", + "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ", + "RuBQRetrieval": { + "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "passage": "", + }, + "RuBQReranking": { + "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "passage": "", + }, + "RiaNewsRetrieval": { + "query": "Given a news title, retrieve relevant news article\nquery: ", + "passage": "", + }, + "MIRACLReranking": { + "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "passage": "", + }, + "MIRACLRetrieval": { + "query": "Given a question, retrieve Wikipedia 
passages that answer the question\nquery: ", + "passage": "", + }, +} + rubert_tiny = ModelMeta( loader=sentence_transformers_loader, name="cointegrated/rubert-tiny", @@ -683,11 +725,13 @@ ) giga_embeddings = ModelMeta( - loader=sentence_transformers_loader, + loader=InstructSentenceTransformerModel, loader_kwargs=dict( - trust_remote_code=True, - instruction_template="Instruct: {instruction}\nQuery: ", + instruction_template="{instruction}", + max_seq_length=512, apply_instruction_to_passages=False, + prompts_dict=GIGA_task_prompts, + trust_remote_code=True, model_kwargs={ "torch_dtype": torch.bfloat16, }, diff --git a/mteb/models/model_implementations/seed_1_6_embedding_models.py b/mteb/models/model_implementations/seed_1_6_embedding_models.py new file mode 100644 index 0000000000..ba6f874805 --- /dev/null +++ b/mteb/models/model_implementations/seed_1_6_embedding_models.py @@ -0,0 +1,430 @@ +from __future__ import annotations + +import base64 +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO +from typing import Any + +import requests +import torch +from PIL import Image +from torch.utils.data import DataLoader + +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.models.abs_encoder import AbsEncoder +from mteb.models.model_implementations.bge_models import bge_chinese_training_data +from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets +from mteb.models.model_meta import ModelMeta +from mteb.requires_package import requires_package +from mteb.types import Array, BatchedInput, PromptType + +logger = logging.getLogger(__name__) + + +def pil_to_base64(image, format="jpeg"): + buffer = BytesIO() + image.save(buffer, format=format) + img_bytes = buffer.getvalue() + encoded_bytes = base64.b64encode(img_bytes) + return encoded_bytes.decode("utf-8") + + +def multimodal_embedding(image_base64=None, text_content=None): + auth_token = 
os.getenv("VOLCES_AUTH_TOKEN") + model_name = "doubao-embedding-vision-250615" + api_url = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal" + + headers = { + "Authorization": f"Bearer {auth_token}", + "x-ark-vlm1": "true", + "Content-Type": "application/json", + } + + if image_base64 is not None and text_content is None: + inputs = [] + for image in image_base64: + image_format = "jpeg" + image_data = f"data:image/{image_format};base64,{image}" + inputs.append({"type": "image_url", "image_url": {"url": image_data}}) + + payload = {"model": model_name, "input": inputs} + elif image_base64 is None and text_content is not None: + payload = { + "model": model_name, + "input": [ + {"type": "text", "text": text_content}, + ], + } + else: + inputs = [] + for image in image_base64: + image_format = "jpeg" + image_data = f"data:image/{image_format};base64,{image}" + inputs.append({"type": "image_url", "image_url": {"url": image_data}}) + inputs.append({"type": "text", "text": text_content}) + payload = {"model": model_name, "input": inputs} + + try: + response = requests.post(url=api_url, headers=headers, json=payload, timeout=10) + + response.raise_for_status() + return response.json() + + except requests.exceptions.HTTPError as http_err: + logger.error(f"HTTP error ({http_err.response.status_code}): {http_err}") + except requests.exceptions.JSONDecodeError: + logger.error("Error:The response is not in valid JSON format") + except requests.exceptions.Timeout: + logger.error("Error:Request timeout") + except Exception as e: + logger.error(f"Unknown error: {str(e)}") + + return None + + +def multi_thread_encode(sentences, batch_size=1, max_workers=8): + batches = [] + for idx in range(0, len(sentences), batch_size): + batches.append((idx // batch_size, sentences[idx : idx + batch_size])) + + n_batches = len(batches) + results = [None] * n_batches # Pre-allocated result list + all_embeddings = [] # Final ordered embeddings + + def _process_batch(batch_idx, 
batch_sentences): + sentence = batch_sentences[0] + + retries = 5 + while retries > 0: + try: + resp = multimodal_embedding(text_content=sentence) + embedding = torch.tensor(resp["data"]["embedding"]) + break + except Exception as e: + time.sleep(1) + logger.warning(f"Retrying... {retries} retries left. Error: {str(e)}") + retries -= 1 + if retries == 0: + raise e + return batch_idx, embedding + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(_process_batch, idx, batch): idx for idx, batch in batches + } + + for future in as_completed(futures): + batch_idx, embeddings = future.result() + results[batch_idx] = embeddings + + for batch_embeddings in results: + all_embeddings.append(batch_embeddings) + + all_embeddings = torch.stack(all_embeddings, dim=0) + all_embeddings = torch.nn.functional.normalize(all_embeddings, dim=-1) + + return all_embeddings.float().cpu() + + +doubao_embedding_training_data = ( + { + "PAWSX": ["train"], + "QBQTC": ["train"], + "STSB": ["train"], + "TNews": ["train"], + "Waimai": ["train"], + "IFlyTek": ["train"], + "MassiveScenarioClassification": ["train"], + } + | bge_chinese_training_data + | nvidia_training_datasets +) + + +class Seed16EmbeddingWrapper(AbsEncoder): + def __init__( + self, + model_name: str, + revision: str, + max_tokens: int, + tokenizer_name: str = "cl100k_base", + embed_dim: int | None = None, + available_embed_dims: list[int | None] = [None], + **kwargs, + ) -> None: + """Wrapper for Seed embedding API.""" + requires_package( + self, + "volcenginesdkarkruntime", + "pip install mteb[ark]", + "tiktoken", + ) + import tiktoken + + self._model_name = model_name + self._max_tokens = max_tokens + self._embed_dim = embed_dim + self._available_embed_dims = available_embed_dims + self._encoding = tiktoken.get_encoding(tokenizer_name) + + def truncate_text_tokens(self, text): + """Truncate a string to have `max_tokens` according to the given encoding.""" + truncated_sentence = 
self._encoding.encode(text)[: self._max_tokens] + return self._encoding.decode(truncated_sentence) + + def get_text_embeddings( + self, + sentences: list[str], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + ) -> Array: + trimmed_sentences = [] + for sentence in sentences: + encoded_sentence = self._encoding.encode(sentence) + if len(encoded_sentence) > self._max_tokens: + truncated_sentence = self.truncate_text_tokens(sentence) + trimmed_sentences.append(truncated_sentence) + else: + trimmed_sentences.append(sentence) + + assert ( + self._embed_dim is None or self._embed_dim in self._available_embed_dims + ), ( + f"Available embed_dims are {self._available_embed_dims}, found {self._embed_dim}" + ) + + if ( + prompt_type == PromptType("query") or prompt_type is None + ) and task_name in TASK_NAME_TO_INSTRUCTION: + instruction = TASK_NAME_TO_INSTRUCTION[task_name] + trimmed_sentences = [instruction.format(i) for i in trimmed_sentences] + + outputs = multi_thread_encode(trimmed_sentences) + + if self._embed_dim is not None: + outputs = outputs[:, : self._embed_dim] + outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + + return outputs.float() + + def get_image_embeddings( + self, + images: list[Image.Image], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + **kwargs: Any, + ) -> Array: + import torchvision.transforms.functional as F + + assert ( + self._embed_dim is None or self._embed_dim in self._available_embed_dims + ), ( + f"Available embed_dims are {self._available_embed_dims}, found {self._embed_dim}" + ) + + if ( + prompt_type == PromptType("query") or prompt_type is None + ) and task_name in TASK_NAME_TO_INSTRUCTION: + instruction = TASK_NAME_TO_INSTRUCTION[task_name] + else: + instruction = "" + + if isinstance(images, DataLoader): + images_base64 = [] + for batch in images: + images_base64.extend([pil_to_base64(F.to_pil_image(b)) for b 
in batch]) + else: + images_base64 = [pil_to_base64(image) for image in images] + outputs = [] + for image in images_base64: + if instruction == "": + resp = multimodal_embedding(image_base64=[image]) + else: + resp = multimodal_embedding( + image_base64=[image], text_content=instruction + ) + embedding = torch.tensor(resp["data"]["embedding"]) + embedding = torch.reshape(embedding, (1, -1)) + + outputs = torch.stack(outputs, dim=0) + + if self._embed_dim is not None: + outputs = outputs[:, : self._embed_dim] + outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + return outputs.float() + + def get_fused_embeddings( + self, + texts: list[str] | None = None, + images: list[Image.Image] | DataLoader | None = None, + fusion_mode="sum", + **kwargs: Any, + ) -> Array: + assert ( + self._embed_dim is None or self._embed_dim in self._available_embed_dims + ), ( + f"Available embed_dims are {self._available_embed_dims}, found {self._embed_dim}" + ) + + assert len(texts) == len(images) + images_base64 = [pil_to_base64(image) for image in images] + + outputs = [] + for i in range(len(images_base64)): + resp = multimodal_embedding( + image_base64=[images_base64[i]], text_content=texts[i] + ) + embedding = torch.tensor(resp["data"]["embedding"]) + embedding = torch.reshape(embedding, (1, -1)) + + outputs = torch.stack(outputs, dim=0) + + if self._embed_dim is not None: + outputs = outputs[:, : self._embed_dim] + outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + return outputs.float() + + def encode( + self, + inputs: DataLoader[BatchedInput], + *, + task_metadata: TaskMetadata, + hf_split: str, + hf_subset: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> Array: + if "text" in inputs.dataset.features and "image" in inputs.dataset.features: + sentences = [text for batch in inputs for text in batch["text"]] + images = [image for batch in inputs for image in batch["image"]] + return self.get_fused_embeddings( + texts=sentences, + 
images=images, + **kwargs, + ) + if "text" in inputs.dataset.features: + sentences = [text for batch in inputs for text in batch["text"]] + return self.get_text_embeddings( + sentences, + task_name=task_metadata.name, + prompt_type=prompt_type, + **kwargs, + ) + if "image" in inputs.dataset.features: + images = [image for batch in inputs for image in batch["image"]] + return self.get_image_embeddings( + images, task_name=task_metadata.name, prompt_type=prompt_type, **kwargs + ) + raise ValueError + + +TASK_NAME_TO_INSTRUCTION = { + "ArguAna": "Given a claim, find documents that refute the claim\n{}", + "ClimateFEVERHardNegatives": "Given a claim, retrieve documents that support or refute the claim\n{}", + "FEVERHardNegatives": "Given a claim, retrieve documents that support or refute the claim\n{}", + "FiQA2018": "Given a financial question, retrieve user replies that best answer the question\n{}", + "HotpotQAHardNegatives": "Given a multi-hop question, retrieve documents that can help answer the question\n{}", + "SCIDOCS": "Given a title of a scientific paper, retrieve the titles of other relevant papers\n{}", + "Touche2020Retrieval.v3": "Given a question, retrieve detailed and persuasive arguments that answer the question\n{}", + "TRECCOVID": "Given a query on COVID-19, retrieve documents that answer the query\n{}", + "AskUbuntuDupQuestions": "Retrieve duplicate questions from AskUbuntu forum\n{}", + "MindSmallReranking": "Retrieve relevant news articles based on user browsing history\n{}", + "SprintDuplicateQuestions": "Retrieve semantically similar text\n{}", + "TwitterSemEval2015": "Retrieve semantically similar text\n{}", + "TwitterURLCorpus": "Retrieve semantically similar text\n{}", + "CQADupstackGamingRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given questionn\n{}", + "CQADupstackUnixRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are 
duplicates to the given question\n{}", + "DuRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "T2Retrieval": "为这个句子生成表示以用于检索相关内容:{}", + "MMarcoRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "MMarcoReranking": "为这个句子生成表示以用于检索相关内容:{}", + "T2Reranking": "为这个句子生成表示以用于检索相关内容:{}", + "CMedQAv1-reranking": "为这个句子生成表示以用于检索相关内容:{}", + "CMedQAv2-reranking": "为这个句子生成表示以用于检索相关内容:{}", + "CovidRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "CmedqaRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "VideoRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "EcomRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "MedicalRetrieval": "为这个句子生成表示以用于检索相关内容:{}", + "ATEC": "Retrieve semantically similar text\n{}", + "BQ": "Retrieve semantically similar text\n{}", + "LCQMC": "Retrieve semantically similar text\n{}", + "PAWSX": "Retrieve semantically similar text\n{}", + "STSB": "Retrieve semantically similar text\n{}", + "AFQMC": "Retrieve semantically similar text\n{}", + "QBQTC": "Retrieve semantically similar text\n{}", + "STS22.v2": "Retrieve semantically similar text\n{}", + "BIOSSES": "Retrieve semantically similar text\n{}", + "SICK-R": "Retrieve semantically similar text\n{}", + "STS12": "Retrieve semantically similar text\n{}", + "STS13": "Retrieve semantically similar text\n{}", + "STS14": "Retrieve semantically similar text\n{}", + "STS15": "Retrieve semantically similar text\n{}", + "STS17": "Retrieve semantically similar text\n{}", + "STSBenchmark": "Retrieve semantically similar text\n{}", + "SummEvalSummarization.v2": "Retrieve semantically similar text\n{}", + "Ocnli": "Retrieve semantically similar text\n{}", + "Cmnli": "Retrieve semantically similar text\n{}", + "TNews": "Classify the fine-grained category of the given news title\n{}", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category\n{}", + "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative\n{}", + "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or 
negative\n{}", + "OnlineShopping": "Classify the customer review for online shopping into positive or negative\n{}", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative\n{}", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual\n{}", + "Banking77Classification": "Given a online banking query, find the corresponding intents\n{}", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset\n{}", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents\n{}", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios\n{}", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation\n{}", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic\n{}", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral\n{}", + "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts\n{}", + "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles\n{}", + "BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts\n{}", + "MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstracts\n{}", + "MedrxivClusteringS2S.v2": "Identify the main category of Medrxiv papers based on the titles\n{}", + "StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles\n{}", + "StackExchangeClusteringP2P.v2": "Identify the topic or theme of StackExchange posts based on the given paragraphs\n{}", + "TwentyNewsgroupsClustering.v2": "Identify the topic or 
theme of the given news articles\n{}", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles\n{}", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts\n{}", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles\n{}", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents\n{}", +} + + +seed_embedding = ModelMeta( + name="Bytedance/Seed1.6-embedding", + revision="1", + release_date="2025-06-18", + languages=[ + "eng-Latn", + "zho-Hans", + ], + loader=Seed16EmbeddingWrapper, + loader_kwargs=dict( + max_tokens=32000, + available_embed_dims=[2048, 1024], + ), + max_tokens=32768, + embed_dim=2048, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://seed1-6-embedding.github.io/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=doubao_embedding_training_data, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/model_implementations/shuu_model.py b/mteb/models/model_implementations/shuu_model.py index 531ce00790..256824108b 100644 --- a/mteb/models/model_implementations/shuu_model.py +++ b/mteb/models/model_implementations/shuu_model.py @@ -1,16 +1,10 @@ from __future__ import annotations -from functools import partial - from mteb.models.model_meta import ModelMeta from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader codemodernbert_crow_meta = ModelMeta( - loader=partial( - sentence_transformers_loader, - model_name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", - revision="044a7a4b552f86e284817234c336bccf16f895ce", - ), + loader=sentence_transformers_loader, name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", languages=["eng-Latn"], open_weights=True, diff --git a/mteb/models/sentence_transformer_wrapper.py 
b/mteb/models/sentence_transformer_wrapper.py index d129f0589f..35d4984901 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -13,7 +13,7 @@ from mteb.types import Array, BatchedInput, PromptType if TYPE_CHECKING: - from mteb import TaskMetadata + from mteb.abstasks.task_metadata import TaskMetadata logger = logging.getLogger(__name__) diff --git a/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py b/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py new file mode 100644 index 0000000000..6eb47c36c2 --- /dev/null +++ b/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTaskAnyClassification +from mteb.abstasks.task_metadata import TaskMetadata + + +class DadoEvalCoarseClassification(AbsTaskAnyClassification): + metadata = TaskMetadata( + name="DadoEvalCoarseClassification", + dataset={ + "path": "MattiaSangermano/DaDoEval", + "revision": "7a78eb7cc137fdd1c5826be1a9e9813177706509", + }, + description="The DaDoEval dataset is a curated collection of 2,759 documents authored by Alcide De Gasperi, spanning the period from 1901 to 1954. Each document in the dataset is manually tagged with its date of issue.", + reference="https://github.com/dhfbk/DaDoEval", + type="Classification", + date=("1901-01-01", "1954-12-31"), + category="t2c", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Written"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{menini2020dadoeval, + author = {Menini, Stefano and Moretti, Giovanni and Sprugnoli, Rachele and Tonelli, Sara and others}, + booktitle = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian. 
Final Workshop (EVALITA 2020)}, + organization = {Accademia University Press}, + pages = {391--397}, + title = {DaDoEval@ EVALITA 2020: Same-genre and cross-genre dating of historical documents}, + year = {2020}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("class", "label") + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/Classification/ita/SardiStanceClassification.py b/mteb/tasks/Classification/ita/SardiStanceClassification.py new file mode 100644 index 0000000000..1f1e950fb7 --- /dev/null +++ b/mteb/tasks/Classification/ita/SardiStanceClassification.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTaskAnyClassification +from mteb.abstasks.task_metadata import TaskMetadata + + +class SardiStanceClassification(AbsTaskAnyClassification): + metadata = TaskMetadata( + name="SardiStanceClassification", + dataset={ + "path": "MattiaSangermano/SardiStance", + "revision": "e25d91e6f6a28ebef42212128f0d5e275b676233", + }, + description="SardiStance is a unique dataset designed for the task of stance detection in Italian tweets. 
It consists of tweets related to the Sardines movement, providing a valuable resource for researchers and practitioners in the field of NLP.", + reference="https://github.com/mirkolai/evalita-sardistance", + type="Classification", + category="t2c", + date=("2019-11-01", "2020-01-31"), + modalities=["text"], + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Social"], + task_subtypes=["Political classification"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{cignarella2020sardistance, + author = {Cignarella, Alessandra Teresa and Lai, Mirko and Bosco, Cristina and Patti, Viviana and Rosso, Paolo and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {Ceur}, + pages = {1--10}, + title = {Sardistance@ evalita2020: Overview of the task on stance detection in italian tweets}, + year = {2020}, +} +""", + ) + + def dataset_transform(self): + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py index 3ca90ea7fb..98b86d252a 100644 --- a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py +++ b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py @@ -18,11 +18,11 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast): metadata = TaskMetadata( name="SNLHierarchicalClusteringP2P", dataset={ - "path": "navjordj/SNL_summarization", - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "path": "mteb/SNLHierarchicalClusteringP2P", + "revision": "693a321c42fb13ffe76bb9043f8d2aaa8f0a9499", }, description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. 
Uses articles categories as clusters.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringP2P", type="Clustering", category="t2c", modalities=["text"], @@ -48,12 +48,6 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast): ) max_depth = 5 - def dataset_transform(self) -> None: - self.dataset = self.dataset.rename_columns( - {"article": "sentences", "category": "labels"} - ) - self.dataset = self.dataset.map(split_labels) - class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): max_document_to_embed = 1300 @@ -62,11 +56,11 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): metadata = TaskMetadata( name="SNLHierarchicalClusteringS2S", dataset={ - "path": "navjordj/SNL_summarization", - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "path": "mteb/SNLHierarchicalClusteringS2S", + "revision": "b505e4ce65f255228e49dd07b6f8148731c5dc64", }, description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. 
Uses articles categories as clusters.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringS2S", type="Clustering", category="t2c", modalities=["text"], @@ -91,9 +85,3 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): prompt="Identify categories in a Norwegian lexicon", ) max_depth = 5 - - def dataset_transform(self) -> None: - self.dataset = self.dataset.rename_columns( - {"ingress": "sentences", "category": "labels"} - ) - self.dataset = self.dataset.map(split_labels) diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/MIRACLVisionRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/MIRACLVisionRetrieval.py index 5573dac403..1be89c2814 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/MIRACLVisionRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/MIRACLVisionRetrieval.py @@ -168,11 +168,11 @@ def load_data(self, **kwargs): return self.corpus, self.queries, self.relevant_docs = _load_miracl_data( - path=self.metadata_dict["dataset"]["path"], - splits=self.metadata_dict["eval_splits"], + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits[0], langs=self.hf_subsets, cache_dir=kwargs.get("cache_dir", None), - revision=self.metadata_dict["dataset"]["revision"], + revision=self.metadata.dataset["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py index a5a687d89f..d620e48aa5 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py @@ -138,14 +138,12 @@ def load_data(self, **kwargs): return self.corpus, self.queries, self.relevant_docs = _load_vdr_multilingual_data( - path=self.metadata_dict["dataset"]["path"], + 
path=self.metadata.dataset["path"], langs=self.hf_subsets, - split=self.metadata_dict["eval_splits"][0], + split=_EVAL_SPLIT, cache_dir=kwargs.get("cache_dir", None), - revision=self.metadata_dict["dataset"].get("revision", None), - trust_remote_code=self.metadata_dict["dataset"].get( - "trust_remote_code", False - ), + revision=self.metadata.dataset.get("revision", None), + trust_remote_code=self.metadata.dataset.get("trust_remote_code", False), ) self.data_loaded = True diff --git a/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py b/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py new file mode 100644 index 0000000000..bc3715bd67 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.task_metadata import TaskMetadata + + +class EmitClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="EmitClassification", + description="""The EMit dataset is a comprehensive resource for the detection of emotions in Italian social media texts. + The EMit dataset consists of social media messages about TV shows, TV series, music videos, and advertisements. 
+ Each message is annotated with one or more of the 8 primary emotions defined by Plutchik + (anger, anticipation, disgust, fear, joy, sadness, surprise, trust), as well as an additional label “love.” + """, + reference="https://github.com/oaraque/emit", + dataset={ + "path": "MattiaSangermano/emit", + "revision": "b0ceff2da0ca463d5c8c97a4e1c6e40545a1c3a6", + }, + type="MultilabelClassification", + category="t2c", + modalities=["text"], + date=("2022-01-01", "2022-12-31"), + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{araque2023emit, + author = {Araque, O and Frenda, S and Sprugnoli, R and Nozza, D and Patti, V and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {CEUR-WS}, + pages = {1--8}, + title = {EMit at EVALITA 2023: Overview of the Categorical Emotion Detection in Italian Social Media Task}, + volume = {3473}, + year = {2023}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns({"emotion_labels": "label"}) + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/MultiLabelClassification/ita/__init__.py b/mteb/tasks/MultiLabelClassification/ita/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py b/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py new file mode 100644 index 0000000000..bc737e9a79 --- /dev/null +++ b/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from 
mteb.abstasks.task_metadata import TaskMetadata + + +class DisCoTexPairClassification(AbsTaskPairClassification): + metadata = TaskMetadata( + name="DisCoTexPairClassification", + description="The DisCoTEX dataset aims at assessing discourse coherence in Italian texts. This dataset focuses on Italian real-world texts and provides resources to model coherence in natural language.", + reference="https://github.com/davidecolla/DisCoTex", + dataset={ + "path": "MattiaSangermano/DisCoTex-last-sentence", + "revision": "ab9ea43f8e54c8b24b12cd1b77d6eb462385a30b", + }, + type="PairClassification", + category="t2t", + modalities=["text"], + date=("2023-01-01", "2023-12-31"), + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="max_ap", + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{brunato2023discotex, + author = {Brunato, Dominique and Colla, Davide and Dell'Orletta, Felice and Dini, Irene and Radicioni, Daniele Paolo and Ravelli, Andrea Amelio and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {CEUR}, + pages = {1--8}, + title = {DisCoTex at EVALITA 2023: overview of the assessing discourse coherence in Italian texts task}, + volume = {3473}, + year = {2023}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.remove_columns(["id", "source"]) + self.dataset = self.dataset.map( + lambda x: { + "prompt": [x["prompt"]], + "target": [x["target"]], + "class": [x["class"]], + }, + batched=True, + batch_size=len(self.dataset["train"]), + ) + self.dataset = self.dataset.rename_column("prompt", "sentence1") + self.dataset = self.dataset.rename_column("target", "sentence2") + self.dataset = self.dataset.rename_column("class", "labels") diff --git a/mteb/tasks/PairClassification/ita/__init__.py b/mteb/tasks/PairClassification/ita/__init__.py new file mode 100644 index 
0000000000..e69de29bb2 diff --git a/mteb/tasks/Reranking/jpn/JQaRAReranking.py b/mteb/tasks/Reranking/jpn/JQaRAReranking.py new file mode 100644 index 0000000000..620abcce97 --- /dev/null +++ b/mteb/tasks/Reranking/jpn/JQaRAReranking.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import datasets +from datasets import Dataset + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.task_metadata import TaskMetadata + +_EVAL_SPLIT = "test" + + +class JQaRAReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="JQaRAReranking", + description=( + "JQaRA: Japanese Question Answering with Retrieval Augmentation " + " - 検索拡張(RAG)評価のための日本語 Q&A データセット. JQaRA is an information retrieval task " + "for questions against 100 candidate data (including one or more correct answers)." + ), + reference="https://huggingface.co/datasets/hotchpotch/JQaRA", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + }, + type="Reranking", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="map", + date=("2020-01-01", "2024-12-31"), + domains=["Encyclopaedic", "Non-fiction", "Written"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=["jpn-Jpan"], + sample_creation="found", + prompt="Given a Japanese question, rerank passages based on their relevance for answering the question", + bibtex_citation=r""" +@misc{yuichi-tateno-2024-jqara, + author = {Yuichi Tateno}, + title = {JQaRA: Japanese Question Answering with Retrieval Augmentation - 検索拡張(RAG)評価のための日本語Q&Aデータセット}, + url = {https://huggingface.co/datasets/hotchpotch/JQaRA}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + # Load queries + query_list = datasets.load_dataset( + name="jqara-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + # Load corpus + 
corpus_list = datasets.load_dataset( + name="jqara-corpus", + split="corpus", + **self.metadata.dataset, + ) + + # Create corpus mapping + corpus_map = {} + for row in corpus_list: + corpus_map[str(row["docid"])] = row["text"] + + # Transform data to RerankingEvaluator format + transformed_data = [] + for row in query_list: + query = row["query"] + retrieved_docs = row["retrieved_docs"] + relevance_scores = row["relevance_scores"] + + positive_docs = [] + negative_docs = [] + + for doc_id, score in zip(retrieved_docs, relevance_scores): + doc_text = corpus_map.get(str(doc_id), "") + if doc_text: # Only include documents that exist in corpus + if score == 1: + positive_docs.append(doc_text) + else: + negative_docs.append(doc_text) + + # Only include samples with both positive and negative documents + if positive_docs and negative_docs: + transformed_data.append( + { + "query": query, + "positive": positive_docs, + "negative": negative_docs, + } + ) + + # Convert to Dataset + self.dataset = {_EVAL_SPLIT: Dataset.from_list(transformed_data)} + self.dataset_transform() # do nothing + self.data_loaded = True diff --git a/mteb/tasks/Reranking/jpn/JaCWIRReranking.py b/mteb/tasks/Reranking/jpn/JaCWIRReranking.py new file mode 100644 index 0000000000..911515cef7 --- /dev/null +++ b/mteb/tasks/Reranking/jpn/JaCWIRReranking.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import datasets +from datasets import Dataset + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.task_metadata import TaskMetadata + +_EVAL_SPLIT = "test" + + +class JaCWIRReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="JaCWIRReranking", + description=( + "JaCWIR is a small-scale Japanese information retrieval evaluation dataset consisting of " + "5000 question texts and approximately 500k web page titles and web page introductions or summaries " + "(meta descriptions, etc.). 
The question texts are created based on one of the 500k web pages, " + "and that data is used as a positive example for the question text." + ), + reference="https://huggingface.co/datasets/hotchpotch/JaCWIR", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + }, + type="Reranking", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="map", + date=("2020-01-01", "2024-12-31"), + domains=["Web", "Written"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{yuichi-tateno-2024-jacwir, + author = {Yuichi Tateno}, + title = {JaCWIR: Japanese Casual Web IR - 日本語情報検索評価のための小規模でカジュアルなWebタイトルと概要のデータセット}, + url = {https://huggingface.co/datasets/hotchpotch/JaCWIR}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + # Load queries + query_list = datasets.load_dataset( + name="jacwir-reranking-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + # Load corpus + corpus_list = datasets.load_dataset( + name="jacwir-reranking-corpus", + split="corpus", + **self.metadata.dataset, + ) + + # Create corpus mapping + corpus_map = {} + for row in corpus_list: + corpus_map[str(row["docid"])] = row["text"] + + # Transform data to RerankingEvaluator format + transformed_data = [] + for row in query_list: + query = row["query"] + retrieved_docs = row["retrieved_docs"] + relevance_scores = row["relevance_scores"] + + positive_docs = [] + negative_docs = [] + + for doc_id, score in zip(retrieved_docs, relevance_scores): + doc_text = corpus_map.get(str(doc_id), "") + if doc_text: # Only include documents that exist in corpus + if score == 1: + positive_docs.append(doc_text) + else: + negative_docs.append(doc_text) + + # Only include samples with both positive and negative documents + if 
positive_docs and negative_docs: + transformed_data.append( + { + "query": query, + "positive": positive_docs, + "negative": negative_docs, + } + ) + + # Convert to Dataset + self.dataset = {_EVAL_SPLIT: Dataset.from_list(transformed_data)} + self.dataset_transform() # do nothing + self.data_loaded = True diff --git a/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py b/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py new file mode 100644 index 0000000000..082a77debe --- /dev/null +++ b/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import logging + +import datasets +import pandas as pd + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.task_metadata import TaskMetadata + +logger = logging.getLogger(__name__) + + +_LANGUAGES = { + "de": ["deu-Latn"], + "en": ["eng-Latn"], + "es": ["spa-Latn"], + "fr": ["fra-Latn"], + "it": ["ita-Latn"], + "pt": ["por-Latn"], + "zh": ["zho-Hans"], +} + +_CITATION = r""" +@misc{11234/1-3105, + author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Aepli, No{\"e}mi and Agi{\'c}, {\v Z}eljko and Ahrenberg, Lars and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Antonsen, Lene and Aplonova, Katya and Aranzabe, Maria Jesus and Arutie, Gashaw and Asahara, Masayuki and Ateyah, Luma and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bellato, Sandra and Bengoetxea, Kepa and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, 
Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiro{\u g}lu Eryi{\u g}it, G{\"u}l{\c s}en and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and {\v C}{\'e}pl{\"o}, Slavom{\'{\i}}r and Cetin, Savas and Chalub, Fabricio and Choi, Jinho and Cho, Yongseok and Chun, Jayeol and Cignarella, Alessandra T. and Cinkov{\'a}, Silvie and Collomb, Aur{\'e}lie and {\c C}{\"o}ltekin, {\c C}a{\u g}r{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Toma{\v z} and Etienne, Aline and Evelyn, Wograine and Farkas, Rich{\'a}rd and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdo{\v s}ov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra, Berta and Grici{\=u}t{\.e}, Bernadeta and Grioni, Matias and Gr{\=u}z{\={\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\'e}line and Habash, Nizar and Haji{\v c}, Jan and Haji{\v c} jr., Jan and H{\"a}m{\"a}l{\"a}inen, Mika and H{\`a} M{\~y}, Linh and Han, Na-Rae and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hennig, Felix and Hladk{\'a}, Barbora and Hlav{\'a}{\v c}ov{\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Hwang, Jena and Ikeda, Takumi and Ion, Radu and Irimia, Elena and Ishola, {\d O}l{\'a}j{\'{\i}}d{\'e} and Jel{\'{\i}}nek, Tom{\'a}{\v s} and Johannsen, Anders and J{\o}rgensen, Fredrik and Juutinen, Markus and 
Ka{\c s}{\i}kara, H{\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\'a}, V{\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\"o}hn, Arne and Kopacewicz, Kamil and Kotsyba, Natalia and Kovalevskait{\.e}, Jolanta and Krek, Simon and Kwak, Sookyoung and Laippala, Veronika and Lambertino, Lorenzo and Lam, Lucia and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\^e} H{\`{\^o}}ng, Phương and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Li, Cheuk Ying and Li, Josie and Li, Keying and Lim, {KyungTae} and Liovina, Maria and Li, Yuan and Ljube{\v s}i{\'c}, Nikola and Loginova, Olga and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and M{\u a}r{\u a}nduc, C{\u a}t{\u a}lina and Mare{\v c}ek, David and Marheinecke, Katrin and Mart{\'{\i}}nez Alonso, H{\'e}ctor and Martins, Andr{\'e} and Ma{\v s}ek, Jan and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendon{\c c}a, Gustavo and Miekka, Niko and Misirpashayeva, Margarita and Missil{\"a}, Anna and Mititelu, C{\u a}t{\u a}lin and Mitrofan, Maria and Miyao, Yusuke and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Morioka, Tomohiko and Mori, Shinsuke and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\"u}{\"u}risep, Kaili and Nainwani, Pinkey and Navarro Hor{\~n}iacek, Juan Ignacio and Nedoluzhko, Anna and Ne{\v s}pore-B{\=e}rzkalne, Gunta and Nguy{\~{\^e}}n Th{\d i}, Lương and Nguy{\~{\^e}}n Th{\d i} Minh, Huy{\`{\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. 
and Ol{\'u}{\`o}kun, Ad{\'e}day{\d o}̀ and Omura, Mai and Osenova, Petya and {\"O}stling, Robert and {\O}vrelid, Lilja and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\L}api{\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perrier, Guy and Petrova, Daria and Petrov, Slav and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalni{\c n}a, Lauma and Pr{\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\"a}{\"a}bis, Andriela and Rademaker, Alexandre and Ramasamy, Loganathan and Rama, Taraka and Ramisch, Carlos and Ravishankar, Vinit and Real, Livy and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\ss}ler, Michael and Rimkut{\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and Romanenko, Mykhailo and Rosa, Rudolf and Rovati, Davide and Roșca, Valentin and Rudina, Olga and Rueter, Jack and Sadde, Shoval and Sagot, Beno{\^{\i}}t and Saleh, Shadi and Salomoni, Alessio and Samard{\v z}i{\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\"a}rg, Dage and Saul{\={\i}}te, Baiba and Sawanakunanon, Yanin and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\'o}, Katalin and {\v S}imkov{\'a}, M{\'a}ria and Simov, Kiril and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Stella, Antonio and Straka, Milan and Strnadov{\'a}, Jana and Suhr, Alane and Sulubacak, Umut and Suzuki, Shingo and Sz{\'a}nt{\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tanaka, Takaaki and Tellier, Isabelle 
and Thomas, Guillaume and Torga, Liisi and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and Tyers, Francis and Uematsu, Sumire and Ure{\v s}ov{\'a}, Zde{\v n}ka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Williams, Seyi and Wir{\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\'o}blewska, Alina and Yako, Mary and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. and Yu, Zhuoran and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Zeldes, Amir and Zhang, Manying and Zhu, Hanzhi}, + copyright = {Licence Universal Dependencies v2.5}, + note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University}, + title = {Universal Dependencies 2.5}, + url = {http://hdl.handle.net/11234/1-3105}, + year = {2019}, +} + +@inproceedings{Conneau2018XNLIEC, + author = {Alexis Conneau and Guillaume Lample and Ruty Rinott and Adina Williams and Samuel R. 
Bowman and Holger Schwenk and Veselin Stoyanov}, + booktitle = {EMNLP}, + title = {XNLI: Evaluating Cross-lingual Sentence Representations}, + year = {2018}, +} + +@article{Lewis2019MLQAEC, + author = {Patrick Lewis and Barlas Oguz and Ruty Rinott and Sebastian Riedel and Holger Schwenk}, + journal = {ArXiv}, + title = {MLQA: Evaluating Cross-lingual Extractive Question Answering}, + volume = {abs/1910.07475}, + year = {2019}, +} + +@article{Liang2020XGLUEAN, + author = {Yaobo Liang and Nan Duan and Yeyun Gong and Ning Wu and Fenfei Guo and Weizhen Qi and Ming Gong and Linjun Shou and Daxin Jiang and Guihong Cao and Xiaodong Fan and Ruofei Zhang and Rahul Agrawal and Edward Cui and Sining Wei and Taroon Bharti and Ying Qiao and Jiun-Hung Chen and Winnie Wu and Shuguang Liu and Fan Yang and Daniel Campos and Rangan Majumder and Ming Zhou}, + journal = {arXiv}, + title = {XGLUE: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation}, + volume = {abs/2004.01401}, + year = {2020}, +} + +@article{Sang2002IntroductionTT, + author = {Erik F. Tjong Kim Sang}, + journal = {ArXiv}, + title = {Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition}, + volume = {cs.CL/0209010}, + year = {2002}, +} + +@article{Sang2003IntroductionTT, + author = {Erik F. 
Tjong Kim Sang and Fien De Meulder}, + journal = {ArXiv}, + title = {Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition}, + volume = {cs.CL/0306050}, + year = {2003}, +} + +@article{Yang2019PAWSXAC, + author = {Yinfei Yang and Yuan Zhang and Chris Tar and Jason Baldridge}, + journal = {ArXiv}, + title = {PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}, + volume = {abs/1908.11828}, + year = {2019}, +} +""" + + +class XGlueWPRReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="XGlueWPRReranking", + description="""XGLUE is a new benchmark dataset to evaluate the performance of cross-lingual pre-trained models + with respect to cross-lingual natural language understanding and generation. XGLUE is composed of 11 tasks spans 19 languages.""", + reference="https://github.com/microsoft/XGLUE", + dataset={ + "path": "forresty/xglue", + "revision": "833b866f2f71a28d7251569020f0ff82ee5fdbbb", + "name": "wpr", + "trust_remote_code": True, + }, + type="Reranking", + category="t2t", + date=("2019-01-01", "2020-12-31"), + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="map", + domains=["Written"], + task_subtypes=[], + license="http://hdl.handle.net/11234/1-3105", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=_CITATION, + ) + + def load_data(self, **kwargs): + def _aggregate_texts(group, neg_label): + return pd.Series( + { + "positive": group[group["relavance_label"] != neg_label][ + "text" + ].tolist(), + "negative": group[group["relavance_label"] == neg_label][ + "text" + ].tolist(), + } + ) + + self.dataset = {} + for lang in self.hf_subsets: + ds = {} + for eval_split in self.metadata.eval_splits: + ds[eval_split] = datasets.load_dataset( + split=f"{eval_split}.{lang}", **self.metadata.dataset + ).map(lambda x: {"text": x["web_page_title"] + x["web_page_snippet"]}) + + neg_label = 
ds[eval_split].features["relavance_label"]._str2int["Bad"] + + grouped_df = ( + ds[eval_split] + .to_pandas() + .groupby("query") + .apply(_aggregate_texts, neg_label=neg_label) + .reset_index() + ) + + ds[eval_split] = datasets.Dataset.from_pandas(grouped_df) + + self.dataset[lang] = datasets.DatasetDict(ds) diff --git a/mteb/tasks/Retrieval/jpn/JaCWIRRetrieval.py b/mteb/tasks/Retrieval/jpn/JaCWIRRetrieval.py new file mode 100644 index 0000000000..1ece1eb89c --- /dev/null +++ b/mteb/tasks/Retrieval/jpn/JaCWIRRetrieval.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + +_EVAL_SPLIT = "test" + + +class JaCWIRRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="JaCWIRRetrieval", + description="""JaCWIR is a small-scale Japanese information retrieval evaluation dataset consisting of +5000 question texts and approximately 500k web page titles and web page introductions or summaries +(meta descriptions, etc.). 
The question texts are created based on one of the 500k web pages, +and that data is used as a positive example for the question text.""", + reference="https://huggingface.co/datasets/hotchpotch/JaCWIR", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("2000-01-01", "2024-12-31"), + domains=["Web", "Written"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{yuichi-tateno-2024-jacwir, + author = {Yuichi Tateno}, + title = {JaCWIR: Japanese Casual Web IR - 日本語情報検索評価のための小規模でカジュアルなWebタイトルと概要のデータセット}, + url = {https://huggingface.co/datasets/hotchpotch/JaCWIR}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="jacwir-retrieval-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + queries[str(row_id)] = row["query"] + # Handle relevant_docs which should be a list + relevant_docs = row["relevant_docs"] + if not isinstance(relevant_docs, list): + relevant_docs = [relevant_docs] + qrels[str(row_id)] = {str(doc_id): 1 for doc_id in relevant_docs} + + corpus_list = datasets.load_dataset( + name="jacwir-retrieval-corpus", + split="corpus", + **self.metadata.dataset, + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/jpn/NLPJournalAbsArticleRetrieval.py b/mteb/tasks/Retrieval/jpn/NLPJournalAbsArticleRetrieval.py new file mode 100644 index 
0000000000..62cff77c26 --- /dev/null +++ b/mteb/tasks/Retrieval/jpn/NLPJournalAbsArticleRetrieval.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + +_EVAL_SPLIT = "test" + + +class NLPJournalAbsArticleRetrievalV2(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="NLPJournalAbsArticleRetrieval.V2", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding full article with the given abstract. " + "This is the V2 dataset (last updated 2025-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + "dataset_version": "v2", + }, + type="Retrieval", + category="t2c", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("1994-10-10", "2025-06-15"), + domains=["Academic", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + adapted_from=["NLPJournalAbsArticleRetrieval"], + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="nlp_journal_abs_article-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + 
queries[str(row_id)] = row["query"] + qrels[str(row_id)] = {str(row["relevant_docs"]): 1} + + corpus_list = datasets.load_dataset( + name="nlp_journal_abs_article-corpus", + split="corpus", + **self.metadata.dataset, + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True + + +class NLPJournalAbsArticleRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="NLPJournalAbsArticleRetrieval", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding full article with the given abstract. " + "This is the V1 dataset (last updated 2020-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + "dataset_version": "v1", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("1994-10-10", "2020-06-15"), + domains=["Academic", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="nlp_journal_abs_article-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + 
queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + queries[str(row_id)] = row["query"] + qrels[str(row_id)] = {str(row["relevant_docs"]): 1} + + corpus_list = datasets.load_dataset( + name="nlp_journal_abs_article-corpus", + split="corpus", + **self.metadata.dataset, + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/jpn/NLPJournalAbsIntroRetrieval.py b/mteb/tasks/Retrieval/jpn/NLPJournalAbsIntroRetrieval.py index dfad47caba..c335dea8f4 100644 --- a/mteb/tasks/Retrieval/jpn/NLPJournalAbsIntroRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/NLPJournalAbsIntroRetrieval.py @@ -8,15 +8,90 @@ _EVAL_SPLIT = "test" +class NLPJournalAbsIntroRetrievalV2(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NLPJournalAbsIntroRetrieval.V2", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding introduction with the given abstract. " + "This is the V2 dataset (last update 2025-06-15)." 
+ ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + "dataset_version": "v2", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("1994-10-10", "2025-06-15"), + domains=["Academic", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + adapted_from=["NLPJournalAbsIntroRetrieval"], + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="nlp_journal_abs_intro-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + queries[str(row_id)] = row["query"] + qrels[str(row_id)] = {str(row["relevant_docs"]): 1} + + corpus_list = datasets.load_dataset( + name="nlp_journal_abs_intro-corpus", split="corpus", **self.metadata.dataset + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True + + class NLPJournalAbsIntroRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="NLPJournalAbsIntroRetrieval", - description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. 
The goal is to find the corresponding introduction with the given abstract.", - reference="https://github.com/sbintuitions/JMTEB", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding introduction with the given abstract. " + "This is the V1 dataset (last update 2020-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", dataset={ "path": "sbintuitions/JMTEB", - "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", "trust_remote_code": True, + "dataset_version": "v1", }, type="Retrieval", category="t2t", @@ -24,14 +99,21 @@ class NLPJournalAbsIntroRetrieval(AbsTaskRetrieval): eval_splits=[_EVAL_SPLIT], eval_langs=["jpn-Jpan"], main_score="ndcg_at_10", - date=("2000-01-01", "2023-12-31"), + date=("1994-10-10", "2020-06-15"), domains=["Academic", "Written"], - task_subtypes=[], + task_subtypes=["Article retrieval"], license="cc-by-4.0", annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="", + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/jpn/NLPJournalTitleAbsRetrieval.py b/mteb/tasks/Retrieval/jpn/NLPJournalTitleAbsRetrieval.py index b907900ce6..ff22467273 100644 --- a/mteb/tasks/Retrieval/jpn/NLPJournalTitleAbsRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/NLPJournalTitleAbsRetrieval.py @@ -8,15 +8,92 @@ _EVAL_SPLIT = "test" +class NLPJournalTitleAbsRetrievalV2(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NLPJournalTitleAbsRetrieval.V2", + description=( + "This dataset was created from 
the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding abstract with the given title. " + "This is the V2 dataset (last updated 2025-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + "dataset_version": "v2", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("1994-10-10", "2025-06-15"), + domains=["Academic", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + adapted_from=["NLPJournalTitleAbsRetrieval"], + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="nlp_journal_title_abs-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + queries[str(row_id)] = row["query"] + qrels[str(row_id)] = {str(row["relevant_docs"]): 1} + + corpus_list = datasets.load_dataset( + name="nlp_journal_title_abs-corpus", + split="corpus", + **self.metadata.dataset, + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True + + class NLPJournalTitleAbsRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( 
name="NLPJournalTitleAbsRetrieval", - description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding abstract with the given title.", - reference="https://github.com/sbintuitions/JMTEB", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding abstract with the given title. " + "This is the V1 dataset (last updated 2020-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", dataset={ "path": "sbintuitions/JMTEB", - "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", "trust_remote_code": True, + "dataset_version": "v1", }, type="Retrieval", category="t2t", @@ -24,14 +101,21 @@ class NLPJournalTitleAbsRetrieval(AbsTaskRetrieval): eval_splits=[_EVAL_SPLIT], eval_langs=["jpn-Jpan"], main_score="ndcg_at_10", - date=("2000-01-01", "2023-12-31"), + date=("1994-10-10", "2020-06-15"), domains=["Academic", "Written"], - task_subtypes=[], + task_subtypes=["Article retrieval"], license="cc-by-4.0", annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="", + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/jpn/NLPJournalTitleIntroRetrieval.py b/mteb/tasks/Retrieval/jpn/NLPJournalTitleIntroRetrieval.py index f0937e5ddd..a5d6946e45 100644 --- a/mteb/tasks/Retrieval/jpn/NLPJournalTitleIntroRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/NLPJournalTitleIntroRetrieval.py @@ -8,15 +8,92 @@ 
_EVAL_SPLIT = "test" +class NLPJournalTitleIntroRetrievalV2(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NLPJournalTitleIntroRetrieval.V2", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding introduction with the given title. " + "This is the V2 dataset (last updated 2025-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", + dataset={ + "path": "sbintuitions/JMTEB", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", + "trust_remote_code": True, + "dataset_version": "v2", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("1994-10-10", "2025-06-15"), + domains=["Academic", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + adapted_from=["NLPJournalTitleIntroRetrieval"], + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset( + name="nlp_journal_title_intro-query", + split=_EVAL_SPLIT, + **self.metadata.dataset, + ) + + queries = {} + qrels = {} + for row_id, row in enumerate(query_list): + queries[str(row_id)] = row["query"] + qrels[str(row_id)] = {str(row["relevant_docs"]): 1} + + corpus_list = datasets.load_dataset( + name="nlp_journal_title_intro-corpus", + split="corpus", + **self.metadata.dataset, + ) + + corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list} + + self.corpus = {_EVAL_SPLIT: corpus} + 
self.queries = {_EVAL_SPLIT: queries} + self.relevant_docs = {_EVAL_SPLIT: qrels} + + self.data_loaded = True + + class NLPJournalTitleIntroRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="NLPJournalTitleIntroRetrieval", - description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding introduction with the given title.", - reference="https://github.com/sbintuitions/JMTEB", + description=( + "This dataset was created from the Japanese NLP Journal LaTeX Corpus. " + "The titles, abstracts and introductions of the academic papers were shuffled. " + "The goal is to find the corresponding introduction with the given title. " + "This is the V1 dataset (last updated 2020-06-15)." + ), + reference="https://huggingface.co/datasets/sbintuitions/JMTEB", dataset={ "path": "sbintuitions/JMTEB", - "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7", + "revision": "b194332dfb8476c7bdd0aaf80e2c4f2a0b4274c2", "trust_remote_code": True, + "dataset_version": "v1", }, type="Retrieval", category="t2t", @@ -24,14 +101,21 @@ class NLPJournalTitleIntroRetrieval(AbsTaskRetrieval): eval_splits=[_EVAL_SPLIT], eval_langs=["jpn-Jpan"], main_score="ndcg_at_10", - date=("2000-01-01", "2023-12-31"), + date=("1994-10-10", "2020-06-15"), domains=["Academic", "Written"], - task_subtypes=[], + task_subtypes=["Article retrieval"], license="cc-by-4.0", annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="", + bibtex_citation=r""" +@misc{jmteb, + author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan}, + howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB}}, + title = {{J}{M}{T}{E}{B}: {J}apanese {M}assive {T}ext {E}mbedding {B}enchmark}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py 
b/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py new file mode 100644 index 0000000000..30f0f316d5 --- /dev/null +++ b/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + +_LANGUAGE_MAPPING = { + "ar": "ara-Arab", + "da": "dan-Latn", + "de": "deu-Latn", + "en": "eng-Latn", + "es": "spa-Latn", + "fi": "fin-Latn", + "fr": "fra-Latn", + "he": "heb-Hebr", + "hu": "hun-Latn", + "it": "ita-Latn", + "ja": "jpn-Jpan", + "ko": "kor-Kore", + "km": "khm-Khmr", + "ms": "msa-Latn", + "nl": "nld-Latn", + "no": "nor-Latn", + "pl": "pol-Latn", + "pt": "por-Latn", + "ru": "rus-Cyrl", + "sv": "swe-Latn", + "th": "tha-Thai", + "tr": "tur-Latn", + "vi": "vie-Latn", + "zh_cn": "zho-Hans", + "zh_hk": "zho-Hant", + "zh_tw": "zho-Hant", +} + + +_EVAL_LANGS = {k: [v] for k, v in _LANGUAGE_MAPPING.items()} + + +class MKQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MKQARetrieval", + description="""Multilingual Knowledge Questions & Answers (MKQA) contains 10,000 queries sampled from the Google Natural Questions dataset. + For each query we collect new passage-independent answers. 
These queries and answers are then human translated into 25 Non-English languages.""", + reference="https://github.com/apple/ml-mkqa", + dataset={ + "path": "apple/mkqa", + "revision": "325131889721ae0ed885b76ecb8011369d75abad", + "trust_remote_code": True, + "name": "mkqa", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + date=("2020-01-01", "2020-12-31"), + eval_splits=["train"], + eval_langs=_EVAL_LANGS, + main_score="ndcg_at_10", + domains=["Written"], + task_subtypes=["Question answering"], + license="cc-by-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{mkqa, + author = {Shayne Longpre and Yi Lu and Joachim Daiber}, + title = {MKQA: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering}, + url = {https://arxiv.org/pdf/2007.15207.pdf}, + year = {2020}, +} + """, + ) + + def load_data(self, **kwargs): + """In this retrieval dataset, corpus and queries are in the same language.""" + if self.data_loaded: + return + + self.queries, self.corpus, self.relevant_docs = {}, {}, {} + + ds = datasets.load_dataset( + **self.metadata.dataset, + ) + + for lang in self.hf_subsets: + self.queries[lang] = {} + self.corpus[lang] = {} + self.relevant_docs[lang] = {} + + for eval_split in self.metadata.eval_splits: + self.queries[lang][eval_split] = {} + self.corpus[lang][eval_split] = {} + self.relevant_docs[lang][eval_split] = {} + + split_data = ds[eval_split] + + query_ids = { + query: f"Q{i}" + for i, query in enumerate( + {entry[lang] for entry in split_data["queries"]} + ) + } + + context_texts = { + hit["text"] + for entry in split_data["answers"] + for hit in entry[lang] + } + + context_ids = {text: f"C{i}" for i, text in enumerate(context_texts)} + + for row in split_data: + query = row["queries"][lang] + contexts = [entry["text"] for entry in row["answers"][lang]] + + if query is None or None in contexts: + continue + + query_id = query_ids[query] + 
for context in contexts: + context_id = context_ids[context] + self.queries[lang][eval_split][query_id] = query + self.corpus[lang][eval_split][context_id] = { + "title": "", + "text": context, + } + if query_id not in self.relevant_docs[lang][eval_split]: + self.relevant_docs[lang][eval_split][query_id] = {} + self.relevant_docs[lang][eval_split][query_id][context_id] = 1 + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py index aa862a20ac..93283bb455 100644 --- a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py @@ -164,11 +164,11 @@ def load_data(self, **kwargs): return self.corpus, self.queries, self.relevant_docs = _load_webfaq_data( - path=self.metadata_dict["dataset"]["path"], + path=self.metadata.dataset["path"], langs=self.hf_subsets, - splits=self.metadata_dict["eval_splits"], + splits=_EVAL_SPLIT, cache_dir=kwargs.get("cache_dir", None), - revision=self.metadata_dict["dataset"]["revision"], + revision=self.metadata.dataset["revision"], ) self.data_loaded = True diff --git a/mteb/tasks/Retrieval/nob/snl_retrieval.py b/mteb/tasks/Retrieval/nob/snl_retrieval.py index 6c22567384..5896dfdc36 100644 --- a/mteb/tasks/Retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/Retrieval/nob/snl_retrieval.py @@ -10,11 +10,11 @@ class SNLRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="SNLRetrieval", dataset={ - "path": "navjordj/SNL_summarization", + "path": "adrlau/navjordj-SNL_summarization_copy", # TODO: replace with mteb/SNLRetrieval after #2820 is resolved. 
"revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", }, description="Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLRetrieval", type="Retrieval", category="t2t", modalities=["text"], diff --git a/pyproject.toml b/pyproject.toml index 0bb62b9bfe..35ea6778f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.29" +version = "1.38.34" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -26,7 +26,7 @@ classifiers = [ ] requires-python = ">=3.9,<3.13" dependencies = [ - "datasets>=2.19.0", + "datasets>=2.19.0, <4.0.0", "numpy>=1.0.0,<3.0.0", "requests>=2.26.0", "scikit-learn>=1.4.0", @@ -66,14 +66,13 @@ dev = [ "bibtexparser>=1.4.3" # used for tests/test_citation_formatting.py ] docs = [ - "mkdocs>=1.6.1", - "mkdocs-material>=9.5.47", - "mkdocstrings[python]>=0.18", - "mkdocs-bibtex>=2.16.2", + "mkdocs>=1.6.1", + "mkdocs-material>=9.5.47", + "mkdocstrings[python]>=0.18", + "mkdocs-bibtex>=2.16.2", ] leaderboard = [ - "gradio==5.27.1; python_version > '3.9'", # 3.10 is required for gradio - "gradio_rangeslider>=0.0.8", + "gradio==5.35.0; python_version > '3.9'", # 3.10 is required for gradio "plotly>=5.24.0,<6.0.0", "cachetools>=5.2.0", "matplotlib>=3.9.4", @@ -83,6 +82,7 @@ leaderboard = [ peft = ["peft>=0.11.0"] flagembedding = ["FlagEmbedding==1.3.4"] jina = ["einops>=0.8.0"] +jina-v4 = ["peft>=0.15.2", "transformers>=4.52.0", "torchvision>=0.22.1"] flash_attention = ["flash-attn>=2.6.3"] openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] model2vec = ["model2vec>=0.3.0"] @@ -101,6 +101,7 @@ open_clip_torch = ["open_clip_torch==2.31.0"] nomic = ["einops>=0.8.1"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] colpali_engine = ["colpali_engine>=0.3.10"] +xet = 
["huggingface_hub>=0.32.0"] [tool.coverage.report] @@ -132,6 +133,7 @@ namespaces = false [tool.setuptools.package-data] "*" = ["*.json"] "mteb.abstasks" = ["dataset_card_template.md"] +"mteb.tasks.Image.ZeroShotClassification.eng.templates" = ["*.txt"] [tool.ruff] diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 7f0d40deec..d61cc684be 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -341,6 +341,16 @@ def encode( ) +@pytest.mark.parametrize("task_name", ["NQ-NL-query", "NQ-NL-passage"]) +def test_prompt_name_split_correctly(task_name: str, tmp_path: Path): + """Test that the task name is split correctly into task name and prompt type + for tasks with multiple `-` in their names. + """ + mock_encocder = AbsMockEncoder() + mock_encocder.prompts = {task_name: task_name} + mock_encocder.validate_task_to_prompt_name() + + @pytest.mark.parametrize( "task", [ diff --git a/tests/test_evaluators/test_RetrievalEvaluator.py b/tests/test_evaluators/test_RetrievalEvaluator.py index 4431389c7d..8115f4c3d4 100644 --- a/tests/test_evaluators/test_RetrievalEvaluator.py +++ b/tests/test_evaluators/test_RetrievalEvaluator.py @@ -2,7 +2,7 @@ import pytest -from mteb import TaskMetadata +from mteb.abstasks.task_metadata import TaskMetadata from mteb.evaluation.evaluators import RetrievalEvaluator from tests.test_benchmark.mock_tasks import general_args diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 7d61d50553..db7071a89a 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -35,10 +35,6 @@ datasets_not_available = [ "AfriSentiLangClassification", - "SNLHierarchicalClusteringP2P", - "SNLClustering", - "SNLHierarchicalClusteringS2S", - "SNLRetrieval", ]