diff --git a/README.md b/README.md
index 59cc5da9e2..32e95932e0 100644
--- a/README.md
+++ b/README.md
@@ -483,6 +483,7 @@ evaluation.run(model, ...)
| 👩💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard |
| 🤝 [Contributing] | How to contribute to MTEB and set it up for development |
| 🌐 [MMTEB] | An open-source effort to extend MTEB to cover a broad set of languages |
+| 🖼️ [MIEB] | Extension of MTEB to image embeddings |
[Tasks]: docs/tasks.md
[Benchmarks]: docs/benchmarks.md
@@ -492,6 +493,7 @@ evaluation.run(model, ...)
[Adding a benchmark]: docs/adding_a_benchmark.md
[Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard
[MMTEB]: docs/mmteb/readme.md
+[MIEB]: docs/mieb.md
[Reproducible workflows]: docs/reproducible_workflow.md
## Citing
diff --git a/docs/mieb.md b/docs/mieb.md
new file mode 100644
index 0000000000..e2135730d5
--- /dev/null
+++ b/docs/mieb.md
@@ -0,0 +1,116 @@
+# Welcome to MIEB! 👋
+
+The Massive Image Embedding Benchmark (MIEB) is an extension of [MTEB](https://arxiv.org/abs/2210.07316) that covers embedding tasks for images and image-text pairs.
+
+## 🌱 Background
+
+MIEB intends to extend MTEB and MMTEB to cover image representation learning and image-text alignment tasks.
+
+## 🪴 Contributing to MIEB
+
+The FIRST step is to _always_ create an issue in the MTEB repo (this one), and add the `mieb` label. PRs without issues will not be accepted.
+
+There are a few ways for anyone to contribute to MIEB:
+
+ 1. Add a dataset as an existing task type. This means that the `AbsTask` already exists, e.g. `AbsTaskImageClassification`, and the effort is solely in adding an instance of it.
+ 2. Add a model. This could mean either: a) The model wrapper, e.g. `OpenCLIPWrapper`, already exists, and the effort is solely in adding a filled out `ModelMeta` object, and/or b) Add a new model wrapper.
+ 3. Add a new task type. This means that the existing task types do not cover this new task. An accompanying evaluator should also be implemented.
+
+Let's go through an example.
+
+## Example
+
+Here is an example implementing a zero-shot image classification from scratch. Let's say we wish to implement CIFAR10 as a task and evaluate an OpenCLIP model on it.
+
+To solve this task, we need to encode the `images`, encode the `class label candidates with prompts` (e.g. "this is a dog pic", "this is a cat pic"), compare them by calculating similarity, and then take the argmax over the similarities to obtain the class prediction for each image. We begin by implementing a model wrapper.
+
+### Model Wrapper
+See the [`ImageEncoder` class](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/encoder_interface.py) for more details. The model class implements `get_text_embeddings`, `get_image_embeddings`, and `calculate_probs` methods.
+As an example, [`OpenCLIPWrapper`](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/models/openclip_models.py) is first implemented, with metadata defined below.
+```python
+class OpenCLIPWrapper:
+ ...
+```
+See also [adding a model](adding_a_model.md) for reference.
+
+### X Evaluator
+With the model, [ZeroshotClassificationEvaluator](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py) is implemented here. This defines how the model is used to perform zero-shot classification and return results on the desired metrics.
+```python
+class ZeroshotClassificationEvaluator(Evaluator):
+ def __init__(self, ...):
+ ...
+ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
+ """Get embeddings and calculate scores."""
+ ...
+```
+
+### AbsTask X
+With the evaluator, [AbsTaskZeroshotClassification](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/abstasks/Image/AbsTaskZeroshotClassification.py) is defined, operating on the dataset, calling the defined Evaluator, and gives out results.
+```python
+class AbsTaskZeroshotClassification(AbsTask):
+ ...
+```
+
+
+### Dataset class
+With all these, we can then define the dataset. [CIFAR10](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py) is implemented like this, subclassing `AbsTaskZeroshotClassification` and overriding the `get_candidate_labels` method, which provides `["a photo of {label_name}"]` prompts to be used in the evaluator.
+```python
+class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(...)
+
+ def get_candidate_labels(self) -> list[str]:
+ ...
+```
+See also [adding a dataset](adding_a_dataset.md) for reference.
+
+### Putting them all together
+With all these pieces in place, we can then run the evaluation:
+```python
+import mteb
+
+model_name = "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
+model = mteb.get_model(model_name=model_name)
+
+tasks = mteb.get_tasks(tasks=["CIFAR10ZeroShot"])
+evaluation = mteb.MTEB(tasks=tasks)
+results = evaluation.run(model)
+```
+
+By default, results will be saved under `results/laion__CLIP-ViT-L-14-laion2B-s32B-b82K/REVISION/CIFAR10ZeroShot.json`. Sometimes metrics can be a bit different from what the original paper reported. This might be due to resolution or layout differences of images in the remake of the dataset.
+
+
+## Model-Specific Running Instructions
+
+Some models require some specific steps before running. Those are collected here.
+
+
+ Vista
+
+ ## set up VISTA
+
+ ```
+ git clone https://github.com/FlagOpen/FlagEmbedding.git
+ cd FlagEmbedding/research/visual_bge
+ pip install -e .
+ pip install torchvision timm einops ftfy
+ ```
+ Go back to the root folder of mteb, then download the vision tower for bge-base:
+ ```
+ cd ..
+ wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_base_en_v1.5.pth?download=true
+ ```
+ rename it to `visualized_base_en_V1.5.pth`
+ ```
+ mv Visualized_base_en_v1.5.pth?download=true visualized_base_en_V1.5.pth
+ ```
+ download the vision tower for bge-m3
+ ```
+ wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_m3.pth?download=true
+ ```
+ rename it to `visualized_m3.pth`
+ ```
+ mv Visualized_m3.pth?download=true visualized_m3.pth
+ ```
+
+
+
\ No newline at end of file
diff --git a/docs/tasks.md b/docs/tasks.md
index 20be22bee2..bc9a4e99a4 100644
--- a/docs/tasks.md
+++ b/docs/tasks.md
@@ -12,6 +12,10 @@ The following tables give you an overview of the tasks in MTEB.
| [AILAStatutes](https://zenodo.org/records/4063986) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None |
| [AJGT](https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66/) (Alomari et al., 2017) | ['ara'] | Classification | s2s | [Social, Written] | None | None |
| [ARCChallenge](https://allenai.org/data/arc) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None |
+| [AROCocoOrder](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None |
+| [AROFlickrOrder](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None |
+| [AROVisualAttribution](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None |
+| [AROVisualRelation](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None |
| [ATEC](https://aclanthology.org/2021.emnlp-main.357) | ['cmn'] | STS | s2s | | None | None |
| [AfriSentiClassification](https://arxiv.org/abs/2302.08956) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None |
| [AfriSentiLangClassification](https://huggingface.co/datasets/HausaNLP/afrisenti-lid-data/) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None |
@@ -39,6 +43,10 @@ The following tables give you an overview of the tasks in MTEB.
| [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None |
| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} |
| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None |
+| [BLINKIT2IMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyMultiChoice | it2i | [Encyclopaedic] | None | None |
+| [BLINKIT2IRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None |
+| [BLINKIT2TMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyMultiChoice | it2t | [Encyclopaedic] | None | None |
+| [BLINKIT2TRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None |
| [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None |
| [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} |
@@ -52,17 +60,28 @@ The following tables give you an overview of the tasks in MTEB.
| [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None |
| [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None |
| [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None |
+| [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [BirdsnapZeroShot](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [BlurbsClusteringP2P.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | p2p | [Fiction, Written] | None | None |
| [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None |
| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} |
| [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None |
-| [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction] | None | None |
+| [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None |
| [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None |
| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None |
| [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None |
| [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None |
| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} |
| [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None |
+| [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | None | None |
+| [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | None | None |
+| [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | None | None |
+| [CIFAR100ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None |
+| [CIFAR10Clustering](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None |
+| [CIFAR10ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None |
+| [CIRRIT2IRetrieval](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Image_Retrieval_on_Real-Life_Images_With_Pre-Trained_Vision-and-Language_Models_ICCV_2021_paper.html) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None |
+| [CLEVRCountZeroShot](https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html) (Johnson et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Constructed] | None | None |
+| [CLEVRZeroShot](https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html) (Johnson et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Constructed] | None | None |
| [CLSClusteringP2P.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | p2p | [Academic, Written] | None | None |
| [CLSClusteringS2S.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | s2s | [Academic, Written] | None | None |
| [CMedQAv1-reranking](https://github.com/zhangsheng93/cMedQA) (Zhang et al., 2017) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None |
@@ -136,7 +155,14 @@ The following tables give you an overview of the tasks in MTEB.
| [CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [CUADVolumeRestrictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [CUB200I2IRetrieval](https://www.florian-schroff.de/publications/CUB-200.pdf) (Welinder et al., 2010) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Academic, Medical, Written] | None | None |
+| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None |
+| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None |
+| [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None |
+| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None |
+| [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None |
| [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None |
@@ -174,6 +200,8 @@ The following tables give you an overview of the tasks in MTEB.
| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'test': 19919} | {'test': {'num_samples': 19919, 'num_docs': 19899, 'num_queries': 20, 'number_of_characters': 44450333, 'min_document_length': 7, 'average_document_length': 2233.03, 'max_document_length': 2959, 'unique_docs': 19143, 'min_query_length': 55, 'average_query_length': 109.75, 'max_query_length': 278, 'unique_queries': 20, 'min_instruction_length': 102, 'average_instruction_length': 295.55, 'max_instruction_length': 811, 'unique_instructions': 20, 'min_changed_instruction_length': 151, 'average_changed_instruction_length': 355.2, 'max_changed_instruction_length': 837, 'unique_changed_instructions': 20, 'min_average_relevant_docs_per_query': 4, 'average_relevant_docs_per_query': 32.7, 'max_average_relevant_docs_per_query': 55, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}} |
| [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 21104} | {'test': {'number_of_characters': 5728450, 'num_samples': 21104, 'num_queries': 500, 'num_documents': 20604, 'min_document_length': 18, 'average_document_length': 0.89, 'max_document_length': 83, 'unique_documents': 20604, 'min_query_length': 88, 'average_query_length': 11420.09, 'max_query_length': 6396, 'unique_queries': 500, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 500}} |
+| [Country211](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ImageClassification | i2i | [Scene] | None | None |
+| [Country211ZeroShot](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None |
| [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None |
| [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None |
| [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None |
@@ -188,6 +216,8 @@ The following tables give you an overview of the tasks in MTEB.
| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None |
| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None |
+| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None |
| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None |
| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None |
@@ -204,14 +234,23 @@ The following tables give you an overview of the tasks in MTEB.
| [Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | None |
| [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None |
+| [EDIST2ITRetrieval](https://aclanthology.org/2023.emnlp-main.297/) (Liu et al., 2023) | ['eng'] | Any2AnyRetrieval | t2it | [News] | None | None |
| [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'min_query_length': 1, 'avg_query_length': 19.69, 'max_query_length': 151, 'unique_query': 29269, 'min_positive_length': 1, 'avg_positive_length': 803.92, 'max_positive_length': 8640, 'unique_positive': 217712, 'min_negative_length': 1, 'avg_negative_length': 808.5, 'max_negative_length': 4441, 'unique_negative': 39551, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'min_query_length': 1, 'avg_query_length': 21.44, 'max_query_length': 151, 'unique_query': 21296, 'min_positive_length': 1, 'avg_positive_length': 868.37, 'max_positive_length': 5545, 'unique_positive': 150734, 'min_negative_length': 1, 'avg_negative_length': 864.45, 'max_negative_length': 3779, 'unique_negative': 23073}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'min_query_length': 3, 'avg_query_length': 20.68, 'max_query_length': 59, 'unique_query': 3703, 'min_positive_length': 1, 'avg_positive_length': 980.96, 'max_positive_length': 8640, 'unique_positive': 32921, 'min_negative_length': 1, 'avg_negative_length': 1023.22, 'max_negative_length': 4441, 'unique_negative': 9285}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'min_query_length': 1, 'avg_query_length': 10.15, 'max_query_length': 60, 'unique_query': 4286, 'min_positive_length': 1, 'avg_positive_length': 358.36, 'max_positive_length': 3488, 'unique_positive': 35165, 'min_negative_length': 1, 'avg_negative_length': 388.08, 'max_negative_length': 3940, 'unique_negative': 7289}}}} |
| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None |
| [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | None | None |
| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | None | None |
+| [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None |
| [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None |
+| [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [EuroSATZeroShot](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
+| [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [FER2013ZeroShot](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None |
+| [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [FGVCAircraftZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
+| [FORBI2IRetrieval](https://github.com/pxiangwu/FORB) (Pengxiang Wu, 2023) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | None | None |
@@ -219,6 +258,9 @@ The following tables give you an overview of the tasks in MTEB.
| [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None |
| [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | PairClassification | s2s | | None | None |
| [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None |
+| [Fashion200kI2TRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
+| [Fashion200kT2IRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
+| [FashionIQIT2IRetrieval](https://openaccess.thecvf.com/content/CVPR2021/html/Wu_Fashion_IQ_A_New_Dataset_Towards_Retrieving_Images_by_Natural_CVPR_2021_paper.html) (Wu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None |
| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None |
| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Financial, Written] | None | None |
| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Financial, Written] | None | None |
@@ -228,13 +270,21 @@ The following tables give you an overview of the tasks in MTEB.
| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None |
| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | None | None |
| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None |
+| [Flickr30kI2TRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Web, Written] | None | None |
+| [Flickr30kT2IRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None |
| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Encyclopaedic, Non-fiction, Written] | None | None |
+| [Food101Classification](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ImageClassification | i2i | [Web] | None | None |
+| [Food101ZeroShot](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None |
| [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None |
| [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None |
| [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None |
| [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | None | None |
| [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [GLDv2I2IRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
+| [GLDv2I2TRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
| [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None |
+| [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | None | None |
+| [GTSRBZeroShot](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None |
| [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None |
| [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None |
| [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | None | None |
@@ -252,6 +302,8 @@ The following tables give you an overview of the tasks in MTEB.
| [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [HamshahriClustring](https://github.com/mallahyari/Farsi-datasets) | ['fas'] | Clustering | p2p | [News] | None | None |
| [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) | ['por'] | Classification | s2s | [Social, Written] | None | None |
+| [HatefulMemesI2TRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
+| [HatefulMemesT2IRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
| [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) | ['rus'] | Classification | s2s | [News, Written] | None | None |
| [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None |
| [HellaSwag](https://rowanzellers.com/hellaswag/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None |
@@ -268,6 +320,12 @@ The following tables give you an overview of the tasks in MTEB.
| [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 
'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 
'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 
'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} |
| [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 
'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 
'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 
'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 
146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 
'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': 
{'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 
'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 
'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 
'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 
'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 
'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 
'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 
'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': 
{'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 
'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 
'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 
'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 
'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 
'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 
'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 
337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 
'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': 
{'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 
'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 
'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 
'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 
'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 
'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 
'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 
'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': 
{'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 
'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 
'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 
'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 
149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 
'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 
'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 
'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': 
{'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 
'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 
'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 
'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 
'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 
'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 
'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 
'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 
'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': 
{'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 
'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 
'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 
'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 
'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 
'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 
'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 
'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 
'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 
'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 
'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': 
{'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 
'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} |
| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 
108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 
'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 
'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 
'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} |
+| [ImageCoDeT2IMultiChoice](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyMultiChoice | it2i | [Web, Written] | None | None |
+| [ImageCoDeT2IRetrieval](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None |
+| [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None |
+| [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None |
+| [Imagenet1k](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ImageClassification | i2i | [Scene] | None | None |
+| [Imagenet1kZeroShot](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None |
| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None |
| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None |
| [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [Government, News, Non-fiction, Spoken, Web, Written] | None | None |
@@ -279,6 +337,8 @@ The following tables give you an overview of the tasks in MTEB.
| [IndicSentimentClassification](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Classification | s2s | [Reviews, Written] | None | None |
| [IndonesianIdClickbaitClassification](http://www.sciencedirect.com/science/article/pii/S2352340920311252) | ['ind'] | Classification | s2s | [News, Written] | None | None |
| [IndonesianMongabayConservationClassification](https://aclanthology.org/2023.sealp-1.4/) | ['ind'] | Classification | s2s | [Web, Written] | None | None |
+| [InfoSeekIT2ITRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None |
+| [InfoSeekIT2TRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [InsurancePolicyInterpretationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [InternationalCitizenshipQuestionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [IsiZuluNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None |
@@ -311,6 +371,7 @@ The following tables give you an overview of the tasks in MTEB.
| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None |
| [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None |
| [LEMBWikimQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ho et al., 2020) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
+| [LLaVAIT2TRetrieval](https://github.com/LinWeizheDragon/FLMR/blob/main/docs/Datasets.md) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Fiction, Government, Non-fiction, Reviews, Web, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': {'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} |
| [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) | ['dan'] | Classification | s2s | [News, Web, Written] | None | None |
| [LeCaRDv2](https://github.com/THUIR/LeCaRDv2) (Haitao Li, 2023) | ['zho'] | Retrieval | p2p | [Legal, Written] | None | None |
@@ -340,6 +401,7 @@ The following tables give you an overview of the tasks in MTEB.
| [LitSearchRetrieval](https://github.com/princeton-nlp/LitSearch) (Ajith et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None |
| [LivedoorNewsClustering.v2](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None |
| [MAUDLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [METI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None |
| [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
@@ -349,6 +411,10 @@ The following tables give you an overview of the tasks in MTEB.
| [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None |
| [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None |
| [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None |
+| [MNIST](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
+| [MSCOCOI2TRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
+| [MSCOCOT2IRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None |
| [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None |
| [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None |
@@ -370,6 +436,8 @@ The following tables give you an overview of the tasks in MTEB.
| [MedicalRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None |
| [MedrxivClusteringP2P.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 74294927, 'min_text_length': 148, 'average_text_length': 1981.2, 'max_text_length': 38759, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 
'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} |
| [MedrxivClusteringS2S.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 4301276, 'min_text_length': 18, 'average_text_length': 114.7, 'max_text_length': 339, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 
'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} |
+| [MemotionI2TRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
+| [MemotionT2IRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
| [MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None |
| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | ['eng'] | Reranking | s2s | [News, Written] | None | None |
| MintakaRetrieval | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
@@ -385,6 +453,7 @@ The following tables give you an overview of the tasks in MTEB.
| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} |
| [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None |
| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None |
+| [NIGHTSI2IRetrieval](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9f09f316a3eaf59d9ced5ffaefe97e0f-Abstract-Conference.html) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None |
| [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None |
| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None |
@@ -431,6 +500,7 @@ The following tables give you an overview of the tasks in MTEB.
| [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'num_samples': 50200, 'number_of_characters': 14759870, 'unique_pairs': 50140, 'min_sentence1_length': 5, 'average_sentence1_length': 145.46, 'max_sentence1_length': 873, 'unique_sentence1': 8258, 'min_sentence2_length': 5, 'average_sentence2_length': 148.57, 'max_sentence2_length': 980, 'unique_sentence2': 50102, 'hf_subset_descriptive_stats': {'ind-abs': {'num_samples': 1000, 'number_of_characters': 295680, 'unique_pairs': 999, 'min_sentence1_length': 5, 'average_sentence1_length': 148.37, 'max_sentence1_length': 727, 'unique_sentence1': 998, 'min_sentence2_length': 6, 'average_sentence2_length': 147.31, 'max_sentence2_length': 629, 'unique_sentence2': 998}, 'ind-btk': {'num_samples': 6600, 'number_of_characters': 1927907, 'unique_pairs': 6597, 'min_sentence1_length': 5, 'average_sentence1_length': 145.37, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 146.74, 'max_sentence2_length': 980, 'unique_sentence2': 6596}, 'ind-bew': {'num_samples': 6600, 'number_of_characters': 1939300, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.41, 'max_sentence2_length': 840, 'unique_sentence2': 6590}, 'ind-bhp': {'num_samples': 1000, 'number_of_characters': 261666, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 133.53, 'max_sentence1_length': 468, 'unique_sentence1': 999, 'min_sentence2_length': 10, 'average_sentence2_length': 128.14, 'max_sentence2_length': 459, 'unique_sentence2': 999}, 'ind-jav': {'num_samples': 6600, 'number_of_characters': 1922162, 
'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 145.81, 'max_sentence2_length': 854, 'unique_sentence2': 6585}, 'ind-mad': {'num_samples': 6600, 'number_of_characters': 1973257, 'unique_pairs': 6598, 'min_sentence1_length': 5, 'average_sentence1_length': 145.36, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 153.62, 'max_sentence2_length': 827, 'unique_sentence2': 6592}, 'ind-mak': {'num_samples': 6600, 'number_of_characters': 1953868, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 150.61, 'max_sentence2_length': 888, 'unique_sentence2': 6586}, 'ind-min': {'num_samples': 6600, 'number_of_characters': 1937033, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.06, 'max_sentence2_length': 837, 'unique_sentence2': 6591}, 'ind-mui': {'num_samples': 1000, 'number_of_characters': 301448, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 150.45, 'max_sentence1_length': 451, 'unique_sentence1': 997, 'min_sentence2_length': 11, 'average_sentence2_length': 150.99, 'max_sentence2_length': 450, 'unique_sentence2': 1000}, 'ind-rej': {'num_samples': 1000, 'number_of_characters': 291205, 'unique_pairs': 1000, 'min_sentence1_length': 9, 'average_sentence1_length': 151.62, 'max_sentence1_length': 873, 'unique_sentence1': 998, 'min_sentence2_length': 8, 'average_sentence2_length': 139.58, 'max_sentence2_length': 784, 'unique_sentence2': 1000}, 'ind-sun': {'num_samples': 6600, 'number_of_characters': 1956344, 'unique_pairs': 6591, 'min_sentence1_length': 5, 
'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 150.99, 'max_sentence2_length': 881, 'unique_sentence2': 6588}}}} |
| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Constructed, Reviews, Social, Web, Written] | None | None |
| [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None |
+| [OKVQAIT2TRetrieval](https://okvqa.allenai.org/) (Marino et al., 2019) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
@@ -440,6 +510,8 @@ The following tables give you an overview of the tasks in MTEB.
| [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [OVENIT2ITRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None |
+| [OVENIT2TRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [Ocnli](https://arxiv.org/abs/2010.05444) (Hai Hu, 2020) | ['cmn'] | PairClassification | s2s | | None | None |
| [OdiaNewsClassification](https://github.com/goru001/nlp-for-odia) (Anoop Kunchukuttan, 2020) | ['ory'] | Classification | s2s | [News, Written] | None | None |
| [OnlineShopping](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None |
@@ -447,6 +519,9 @@ The following tables give you an overview of the tasks in MTEB.
| [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) (Mathias Creutz, 2018) | ['deu', 'eng', 'fin', 'fra', 'rus', 'swe'] | PairClassification | s2s | [Spoken] | None | None |
| [OralArgumentQuestionPurposeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [OverrulingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [OxfordFlowersClassification](https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train) | ['eng'] | ImageClassification | i2i | [Reviews] | None | None |
+| [OxfordPets](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [OxfordPetsZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [PAC](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None |
| [PAWSX](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None |
| [PIQA](https://arxiv.org/abs/1911.11641) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None |
@@ -454,6 +529,8 @@ The following tables give you an overview of the tasks in MTEB.
| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | ['pol'] | PairClassification | s2s | [News, Written] | None | None |
| [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None |
| [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | None | None |
+| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ImageClassification | i2i | [Medical] | None | None |
+| [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ZeroShotClassification | i2t | [Medical] | None | None |
| [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 
'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 
'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 
'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} |
| [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None |
@@ -486,10 +563,27 @@ The following tables give you an overview of the tasks in MTEB.
| [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None |
| [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None |
| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
+| [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [RESISC45ZeroShot](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
+| [ROxfordEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [ROxfordHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [ROxfordMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [RP2kI2IRetrieval](https://arxiv.org/abs/2006.12634) (Peng et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [RParisEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [RParisHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
+| [RParisMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None |
+| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenović et al., 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None |
| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None |
| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None |
+| [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None |
| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None |
| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': 
{'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 
'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 
'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 
'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': 
{'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 
118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} |
+| [RenderedSST2](https://huggingface.co/datasets/clip-benchmark/wds_renderedsst2) | ['eng'] | ZeroShotClassification | i2t | [Reviews] | None | None |
| [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None |
| [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None |
| [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None |
@@ -538,18 +632,30 @@ The following tables give you an overview of the tasks in MTEB.
| [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None |
| [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | None | None |
| [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None |
+| [SOPI2IRetrieval](https://paperswithcode.com/dataset/stanford-online-products) (Oh Song et al., 2016) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | None | None |
+| [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [STL10ZeroShot](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'number_of_characters': 402118, 'min_sentence1_length': 3, 'average_sentence1_len': 63.79, 'max_sentence1_length': 220, 'unique_sentence1': 2236, 'min_sentence2_length': 7, 'average_sentence2_len': 65.59, 'max_sentence2_length': 204, 'unique_sentence2': 2797, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} |
+| [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Encyclopaedic, News, Written] | None | None |
| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [News, Non-fiction, Web, Written] | None | None |
+| [STS13VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [News, Non-fiction, Web, Written] | None | None |
| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None |
+| [STS14VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None |
| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None |
+| [STS15VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, News, Spoken, Web, Written] | None | None |
| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None |
+| [STS16VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None |
| [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 
'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 
'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} |
+| [STS17MultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None |
| [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None |
| [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None |
| [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None |
| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None |
+| [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None |
| [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None |
+| [SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None |
| [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written] | None | None |
| [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | None | None |
| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None |
@@ -557,6 +663,9 @@ The following tables give you an overview of the tasks in MTEB.
| [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None |
| [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None |
| [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Written] | None | None |
+| [SciMMIR](https://huggingface.co/datasets/m-a-p/SciMMIR) (Siwei Wu, 2024) | ['eng'] | ZeroShotClassification | i2t | [Academic] | None | None |
+| [SciMMIRI2TRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2t | [Academic] | None | None |
+| [SciMMIRT2IRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
| [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None |
| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None |
| [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None |
@@ -564,6 +673,7 @@ The following tables give you an overview of the tasks in MTEB.
| [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None |
| [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | None | None |
| [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None |
+| [SketchyI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319, 'train': 11870} | {'test': {'num_samples': 1319, 'number_of_characters': 122279, 'num_texts_in_train': 46, 'min_text_length': 8, 'average_text_length': 92.71, 'max_text_length': 1584, 'unique_text': 1315, 'unique_labels': 2, 'labels': {'1': {'count': 360}, '0': {'count': 959}}}, 'train': {'num_samples': 11870, 'number_of_characters': 1130860, 'num_texts_in_train': None, 'min_text_length': 7, 'average_text_length': 95.27, 'max_text_length': 2112, 'unique_text': 11655, 'unique_labels': 2, 'labels': {'1': {'count': 3245}, '0': {'count': 8625}}}} |
| [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) (Štefánik et al., 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None |
| [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None |
@@ -579,7 +689,11 @@ The following tables give you an overview of the tasks in MTEB.
| [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None |
| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None |
| [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} |
+| [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None |
+| [StanfordCarsI2IRetrieval](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None |
+| [StanfordCarsZeroShot](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None |
| [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None |
+| [SugarCrepe](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None |
| [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | Summarization | p2p | [News, Written] | None | None |
| [SummEvalSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['eng'] | Summarization | p2p | [News, Written] | None | None |
| [SwahiliNewsClassification](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) | ['swa'] | Classification | s2s | [News, Written] | None | None |
@@ -627,6 +741,7 @@ The following tables give you an overview of the tasks in MTEB.
| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None |
| [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None |
| [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None |
+| [TUBerlinT2IRetrieval](https://dl.acm.org/doi/pdf/10.1145/2185520.2185540?casa_token=tq-eUx5UROYAAAAA:_694nPzE7tali6LCkxQc0M-mlo9xslasPMcVnFPMy9tDfvt7lg7p1RTe-k8VWCjuv9gmkQqasKUZ) (Eitz et al., 2012) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
| [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None |
| [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None |
| [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) (Tatoeba community, 2021) | ['afr', 'amh', 'ang', 'ara', 'arq', 'arz', 'ast', 'awa', 'aze', 'bel', 'ben', 'ber', 'bos', 'bre', 'bul', 'cat', 'cbk', 'ceb', 'ces', 'cha', 'cmn', 'cor', 'csb', 'cym', 'dan', 'deu', 'dsb', 'dtp', 'ell', 'eng', 'epo', 'est', 'eus', 'fao', 'fin', 'fra', 'fry', 'gla', 'gle', 'glg', 'gsw', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ido', 'ile', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kat', 'kaz', 'khm', 'kor', 'kur', 'kzj', 'lat', 'lfn', 'lit', 'lvs', 'mal', 'mar', 'max', 'mhr', 'mkd', 'mon', 'nds', 'nld', 'nno', 'nob', 'nov', 'oci', 'orv', 'pam', 'pes', 'pms', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'swg', 'swh', 'tam', 'tat', 'tel', 'tgl', 'tha', 'tuk', 'tur', 'tzl', 'uig', 'ukr', 'urd', 'uzb', 'vie', 'war', 'wuu', 'xho', 'yid', 'yue', 'zsm'] | BitextMining | s2s | [Written] | None | None |
@@ -647,6 +762,7 @@ The following tables give you an overview of the tasks in MTEB.
| [TextualismToolPlainLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [ThuNewsClusteringP2P.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | p2p | [News, Written] | None | None |
| [ThuNewsClusteringS2S.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | s2s | [News, Written] | None | None |
+| [TinyImageNetClustering](https://huggingface.co/datasets/zh-plus/tiny-imagenet/viewer/default/valid) | ['eng'] | ImageClustering | i2i | [Encyclopaedic] | None | None |
| [TopiOCQA](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None |
@@ -668,19 +784,39 @@ The following tables give you an overview of the tasks in MTEB.
| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None |
| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} |
| [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
+| [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | None | None |
+| [UCF101ZeroShot](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None |
| [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None |
| [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None |
| [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None |
| [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None |
| [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None |
+| [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageMultilabelClassification | i2i | [Encyclopaedic] | None | None |
+| [VQA2IT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html) (Goyal et al., 2017) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | None | None |
| [VideoRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None |
+| [VidoreArxivQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreDocVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreInfoVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreShiftProjectRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreSyntheticDocQAAIRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreSyntheticDocQAEnergyRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreSyntheticDocQAGovernmentReportsRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreSyntheticDocQAHealthcareIndustryRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreTabfquadRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
+| [VidoreTatdqaRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None |
| [VieMedEVBitextMining](https://aclanthology.org/2015.iwslt-evaluation.11/) (Nhu Vo, 2024) | ['eng', 'vie'] | BitextMining | s2s | [Medical, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 575910, 'unique_pairs': 2048, 'min_sentence1_length': 11, 'average_sentence1_length': 139.23, 'max_sentence1_length': 1291, 'unique_sentence1': 2048, 'min_sentence2_length': 11, 'average_sentence2_length': 141.98, 'max_sentence2_length': 1217, 'unique_sentence2': 2047}} |
| [VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | None | None |
| [VieStudentFeedbackClassification](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None |
+| [VisualNewsI2TRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None |
+| [VisualNewsT2IRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None |
+| [VizWizIT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/papers/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.pdf) (Gurari et al., 2018) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | None | None |
| [VoyageMMarcoReranking](https://arxiv.org/abs/2312.16144) (Benjamin Clavié, 2023) | ['jpn'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None |
+| [WITT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['ara', 'bul', 'dan', 'ell', 'eng', 'est', 'ind', 'jpn', 'kor', 'tur', 'vie'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None |
| [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) | ['jpn'] | Classification | s2s | [Social, Written] | None | None |
| [Waimai](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None |
| [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lù, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None |
+| [WebQAT2ITRetrieval](https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html) (Chang et al., 2022) | ['eng'] | Any2AnyRetrieval | t2it | [Encyclopaedic] | None | None |
+| [WebQAT2TRetrieval](https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html) (Chang et al., 2022) | ['eng'] | Any2AnyRetrieval | t2t | [Encyclopaedic] | None | None |
| [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None |
| [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None |
| [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None |
@@ -703,7 +839,10 @@ The following tables give you an overview of the tasks in MTEB.
| [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None |
| [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None |
| [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None |
+| [Winoground](https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper.html) (Tristan Thrush, 2022) | ['eng'] | ImageTextPairClassification | i2t | [Social] | None | None |
| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None |
+| [XFlickr30kCoT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['deu', 'eng', 'ind', 'jpn', 'rus', 'spa', 'tur', 'zho'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None |
+| [XM3600T2IRetrieval](https://aclanthology.org/2022.emnlp-main.45/) (Thapliyal et al., 2022) | ['ara', 'ben', 'ces', 'dan', 'deu', 'ell', 'eng', 'fas', 'fil', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mri', 'nld', 'nor', 'pol', 'por', 'quz', 'ron', 'rus', 'spa', 'swa', 'swe', 'tel', 'tha', 'tur', 'ukr', 'vie', 'zho'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None |
| XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None |
| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 
56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 
'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 
'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 
682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 
'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} |
| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | None | None |
@@ -727,1060 +866,1061 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | Sum |
-|---|------|------|------|------|------|------|------|------|------|------|------|---|
-| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 10 |
-| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 |
-| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amu | Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aom | Ömie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apn | Apinayé | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 29 |
-| arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 8 |
-| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| arn | Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| arp | Arapaho | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 |
-| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 7 |
-| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 14 |
-| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| azb | South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| aze | Azerbaijani | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bel | Belarusian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| bem | Bemba (Zambia) | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| ben | Bengali | Indo-European | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 28 |
-| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
-| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bkq | Bakairí | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| blz | Balantak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bqc | Boko (Benin) | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bqp | Busa | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bug | Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 13 |
-| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| caa | Chortí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cao | Chácobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cat | Catalan | Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| cav | Cavineña | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbc | Carapana | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 17 |
-| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| chf | Tabasco Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| chz | Ozumacín Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| cjo | Ashéninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 44 |
-| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 41 |
-| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cor | Cornish | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cot | Caquinte | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cpc | Ajyíninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cpu | Pichis Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cpy | South Ucayali Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| csb | Kashubian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cux | Tepeuxila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cwe | Kwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
-| daa | Dangaléat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 |
-| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 59 |
-| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| dop | Lukpa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dov | Dombe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| ebk | Eastern Bontok | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 |
-| emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 344 |
-| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 8 |
-| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 7 |
-| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fas | Persian | Indo-European | 4 | 28 | 5 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 90 |
-| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 |
-| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 58 |
-| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gng | Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gnw | Western Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| grn | Guarani | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| gsw | Swiss German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gub | Guajajára | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gui | Eastern Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 |
-| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gun | Mbyá Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gux | Gourmanchéma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gvf | Golin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gym | Ngäbere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 |
-| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 |
-| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 39 |
-| hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 |
-| hsb | Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hun | Hungarian | Uralic | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 12 |
-| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 9 |
-| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 12 |
-| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 21 |
-| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| iou | Tuma-Irumu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 |
-| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ita | Italian | Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 27 |
-| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 13 |
-| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 36 |
-| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 19 |
-| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 10 |
-| kaz | Kazakh | Turkic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| kbc | Kadiwéu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kbh | Camsá | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kbm | Iwal | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kbp | Kabiyè | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| kek | Kekchí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kgk | Kaiwá | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 8 |
-| kir | Kirghiz | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kje | Kisar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kmg | Kâte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 30 |
-| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpj | Karajá | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kvn | Border Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kwi | Awa-Cuaiquer | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kyz | Kayabí | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lif | Limbu | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| lim | Limburgan | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| lin | Lingala | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| lit | Lithuanian | Indo-European | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| maj | Jalapa De Díaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 19 |
-| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| maq | Chiquihuitlán Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 20 |
-| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mav | Sateré-Mawé | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbj | Nadëb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbl | Maxakalí | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcf | Matsés | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mco | Coatlán Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mek | Mekeo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| meq | Merey | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| meu | Motu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mib | Atatláhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mil | Peñoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
-| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mkd | Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| mkj | Mokilese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mle | Manambu | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 9 |
-| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| mop | Mopán Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mpm | Yosondúa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
-| msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mwp | Kala Lagaw Ya | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mwr | Marwari | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mxb | Tezoatlán Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 |
-| myk | Mamara Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| myu | Mundurukú | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nab | Southern Nambikuára | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ncl | Michoacán Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nds | Low German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ngu | Guerrero Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 20 |
-| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
-| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nob | Norwegian Bokmål | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 19 |
-| noe | Nimadi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 3 |
-| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ntu | Natügu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| nyu | Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| omw | South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 15 |
-| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| otq | Querétaro Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ots | Estado de México Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pab | Parecís | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pad | Paumarí | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 |
-| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pap | Papiamento | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| plu | Palikúr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 |
-| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 28 |
-| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pri | Paicî | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| ptp | Patep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qub | Huallaga Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| quy | Ayacucho Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| qvc | Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qve | Eastern Apurímac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvn | North Junín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvs | San Martín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qxh | Panao Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| qxo | Southern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 18 |
-| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 52 |
-| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| sah | Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 |
-| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sgb | Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 12 |
-| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 |
-| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| sna | Shona | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 |
-| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 44 |
-| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 9 |
-| srq | Sirionó | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ssd | Siroi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sun | Sundanese | Austronesian | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 |
-| sus | Susu | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 15 |
-| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 22 |
-| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 21 |
-| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| tat | Tatar | Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| taw | Tai | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 24 |
-| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 21 |
-| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tim | Timbe | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tlf | Telefol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tnc | Tanimuca-Retuarã | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| too | Xicotepec De Juárez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tpa | Taupota | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
-| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| trc | Copala Triqui | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
-| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 20 |
-| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| txu | Kayapó | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| tzm | Central Atlas Tamazight | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
-| ukr | Ukrainian | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 |
-| uli | Ulithian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| urb | Urubú-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 19 |
-| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
-| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| ven | Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
-| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 18 |
-| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| vmy | Ayautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 |
-| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wuu | Wu Chinese | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xav | Xavánte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 |
-| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| xtm | Magdalena Peñasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
-| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yle | Yele | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yml | Iamalele | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yon | Yongkom | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 16 |
-| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yre | Yaouré | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
-| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zaa | Sierra de Juárez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zac | Ocotlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zam | Miahuatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zar | Rincón Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 21 |
-| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zos | Francisco León Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpl | Lachixío Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpo | Amatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpu | Yalálag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 |
-| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 |
-| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
-| Total | None | None | None | 1398 | 836 | 311 | 3 | 28 | 91 | 55 | 507 | 88 | 2 | 2 |
+| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyRetrieval | Any2TextMutipleChoice | BitextMining | Classification | Clustering | ImageClassification | ImageClustering | ImageMultilabelClassification | ImageTextPairClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisualSTS | ZeroShotClassification | Sum |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aau | Abau | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| abt | Ambulas | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ace | Achinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| acr | Achi | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| adz | Adzera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 10 |
+| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agg | Angor | Senagi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agm | Angaataha | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agr | Aguaruna | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| agu | Aguacateco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aia | Arosi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ake | Akawaio | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| alp | Alune | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| alq | Algonquin | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 14 |
+| amk | Ambai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amn | Amanab | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amp | Alamblak | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ang | Old English (ca. 450-1100) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aom | Ömie | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aon | Bumbita Arapesh | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apb | Sa'a | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apn | Apinayé | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ara | Arabic | Unclassified | 0 | 2 | 0 | 2 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 1 | 0 | 32 |
+| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 8 |
+| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| arp | Arapaho | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 |
+| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 7 |
+| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| asm | Assamese | Indo-European | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 14 |
+| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ast | Asturian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| auc | Waorani | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| aui | Anuki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| awa | Awadhi | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bak | Bashkir | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| bam | Bambara | Mande | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| ban | Balinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bbb | Barai | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bch | Bariai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bco | Kaluli | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bdd | Bunama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ben | Bengali | Indo-European | 0 | 1 | 0 | 7 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 29 |
+| beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bew | Betawi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| bhp | Bima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| big | Biangai | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjk | Barok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjn | Banjar | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjv | Bedjond | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bkd | Binukid | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bki | Baki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bkq | Bakairí | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| blw | Balangao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| blz | Balantak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bmr | Muinane | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bnp | Bola | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bns | Bundeli | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| boa | Bora | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bos | Bosnian | Indo-European | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bqp | Busa | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bre | Breton | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bsn | Barasana-Eduria | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bug | Buginese | Austronesian | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| buk | Bugawac | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 14 |
+| bus | Bokobaru | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bvr | Burarra | Maningrida | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| byr | Baruya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| byx | Qaqet | Baining | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bzd | Bribri | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| caa | Chortí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cab | Garifuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cac | Chuj | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cao | Chácobo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| car | Galibi Carib | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cat | Catalan | Indo-European | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| cav | Cavineña | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ces | Czech | Indo-European | 0 | 1 | 0 | 4 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 18 |
+| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cha | Chamorro | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| chk | Chuukese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| chz | Ozumacín Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| cjo | Ashéninka Pajonal | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cjv | Chuave | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| cle | Lealao Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 1 | 0 | 45 |
+| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| cni | Asháninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 0 | 0 | 41 |
+| cof | Colorado | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| con | Cofán | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cor | Cornish | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cot | Caquinte | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cpc | Ajyíninka Apurucayali | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cpu | Pichis Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cpy | South Ucayali Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| csb | Kashubian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cso | Sochiapam Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ctu | Chol | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cut | Teutila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| cym | Welsh | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
+| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dan | Danish | Indo-European | 0 | 2 | 0 | 5 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 25 |
+| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| deu | German | Indo-European | 0 | 2 | 0 | 6 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 2 | 0 | 63 |
+| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dif | Dieri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dik | Southwestern Dinka | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| div | Dhivehi | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dob | Dobu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dww | Dawawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dyu | Dyula | Mande | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 3 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 18 |
+| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 18 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 7 | 24 | 483 |
+| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| est | Estonian | Uralic | 0 | 1 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 9 |
+| etr | Edolo | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| eus | Basque | Unclassified | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| faa | Fasu | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 |
+| far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fas | Persian | Indo-European | 0 | 1 | 0 | 4 | 28 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 0 | 0 | 91 |
+| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| fin | Finnish | Uralic | 0 | 1 | 0 | 3 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 0 | 0 | 20 |
+| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fra | French | Indo-European | 0 | 1 | 0 | 7 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 2 | 0 | 61 |
+| fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| fur | Friulian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gai | Borei | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| geb | Kire | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| gle | Irish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| glg | Galician | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| glk | Gilaki | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gnw | Western Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| grn | Guarani | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gub | Guajajára | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gui | Eastern Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| guj | Gujarati | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 18 |
+| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gun | Mbyá Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gux | Gourmanchéma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gym | Ngäbere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| gyr | Guarayu | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hat | Haitian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 14 |
+| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 12 |
+| heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hin | Hindi | Indo-European | 0 | 1 | 0 | 9 | 12 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 40 |
+| hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hmn | Hmong | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hot | Hote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hrv | Croatian | Indo-European | 0 | 1 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 11 |
+| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hun | Hungarian | Uralic | 0 | 1 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 13 |
+| hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hvn | Sabu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| hye | Armenian | Indo-European | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| ian | Iatmul | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 12 |
+| ido | Ido | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ile | Interlingue | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ilo | Iloko | Austronesian | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ind | Indonesian | Austronesian | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 24 |
+| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| isl | Icelandic | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ita | Italian | Indo-European | 0 | 1 | 0 | 5 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 2 | 0 | 30 |
+| iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jae | Yabem | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jav | Javanese | Austronesian | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 13 |
+| jic | Tol | Jicaquean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| jpn | Japanese | Japonic | 0 | 3 | 0 | 5 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 0 | 0 | 39 |
+| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kan | Kannada | Dravidian | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 19 |
+| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| kat | Georgian | Kartvelian | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 10 |
+| kaz | Kazakh | Turkic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| kbc | Kadiwéu | Guaicuruan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kbh | Camsá | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kbm | Iwal | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kbp | Kabiyè | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kde | Makonde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| kek | Kekchí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 8 |
+| kir | Kirghiz | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kje | Kisar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kmg | Kâte | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmo | Kwoma | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kmu | Kanite | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kor | Korean | Koreanic | 0 | 2 | 0 | 4 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 1 | 0 | 33 |
+| kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpj | Karajá | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kql | Kyenele | Yuat | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kqw | Kandas | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ksj | Uare | Kwalean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ktm | Kurti | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kto | Kuot | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kur | Kurdish | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kwj | Kwanga | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kyf | Kouya | Kru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kyz | Kayabí | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lac | Lacandon | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| lat | Latin | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| lav | Latvian | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| lbb | Label | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lcm | Tungag | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| leu | Kara (Papua New Guinea) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lex | Luang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lgl | Wala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lij | Ligurian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| lim | Limburgan | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| llg | Lole | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| lmo | Lombard | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| lww | Lewo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mad | Madurese | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| mag | Magahi | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| mai | Maithili | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| maj | Jalapa De Díaz Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mak | Makasar | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| mal | Malayalam | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 19 |
+| mam | Mam | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| maq | Chiquihuitlán Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mar | Marathi | Indo-European | 0 | 0 | 0 | 7 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 20 |
+| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mav | Sateré-Mawé | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| max | North Moluccan Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| maz | Central Mazahua | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbc | Macushi | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbj | Nadëb | Naduhup | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbl | Maxakalí | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mca | Maca | Mataguayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcf | Matsés | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mco | Coatlán Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcq | Ese | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mcr | Menya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mee | Mengen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mek | Mekeo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| met | Mato | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| meu | Motu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mib | Atatláhuca Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mih | Chayuco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mil | Peñoles Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| min | Minangkabau | Austronesian | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
+| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mir | Isthmus Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mle | Manambu | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mlp | Bargam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mmx | Madak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mna | Mbula | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| mon | Mongolian | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| mop | Mopán Maya | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| mox | Molima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mpm | Yosondúa Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mps | Dadibi | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| msy | Aruamu | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mui | Musi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| mup | Malvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| mux | Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mva | Manam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mwc | Are | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mwr | Marwari | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mxb | Tezoatlán Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| myu | Mundurukú | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| myw | Muyuw | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| myy | Macuna | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nab | Southern Nambikuára | Nambiquaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nak | Nakanai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ncl | Michoacán Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nds | Low German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nep | Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| nfa | Dhao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nho | Takuu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nld | Dutch | Indo-European | 0 | 1 | 0 | 6 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 2 | 0 | 23 |
+| nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
+| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 19 |
+| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nop | Numanggang | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| nsn | Nehan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| nss | Nali | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ntu | Natügu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nus | Nuer | Nilotic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nvm | Namiae | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ong | Olo | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| orm | Oromo | Unclassified | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| orv | Old Russian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ory | Odia | Indo-European | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 15 |
+| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| otq | Querétaro Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ots | Estado de México Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pab | Parecís | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pad | Paumarí | Arawan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| pah | Tenharim | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pam | Pampanga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pan | Panjabi | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 18 |
+| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pap | Papiamento | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| pib | Yine | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pio | Piapoco | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| plu | Palikúr | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pma | Paama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pms | Piemontese | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pol | Polish | Indo-European | 0 | 1 | 0 | 4 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 1 | 0 | 48 |
+| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| por | Portuguese | Indo-European | 0 | 1 | 0 | 4 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 1 | 0 | 30 |
+| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pri | Paicî | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| prs | Dari | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| ptp | Patep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ptu | Bambam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| pus | Pushto | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qub | Huallaga Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| quc | K'iche' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qve | Eastern Apurímac Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvn | North Junín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvs | San Martín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvw | Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qwh | Huaylas Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qxh | Panao Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rej | Rejang | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rmc | Carpathian Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rom | Romany | Unclassified | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| ron | Romanian | Indo-European | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 19 |
+| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rop | Kriol | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| row | Dela-Oenale | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rro | Waima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| rus | Russian | Indo-European | 0 | 2 | 0 | 5 | 13 | 6 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 1 | 0 | 55 |
+| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| san | Sanskrit | Indo-European | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 |
+| sat | Santali | Austroasiatic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| sbe | Saliba | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| scn | Sicilian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| seh | Sena | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sey | Secoya | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sgz | Sursurunga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| shj | Shatt | Dajuic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| slk | Slovak | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 12 |
+| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| slv | Slovenian | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 |
+| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| smo | Samoan | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| snd | Sindhi | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| snn | Siona | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| spa | Spanish | Indo-European | 0 | 2 | 0 | 4 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 2 | 0 | 48 |
+| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sps | Saposa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| spy | Sabaot | Nilotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sqi | Albanian | Unclassified | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| srd | Sardinian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| sri | Siriano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| srp | Serbian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 9 |
+| srq | Sirionó | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ssg | Seimat | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sua | Sulka | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 16 |
+| swe | Swedish | Indo-European | 0 | 1 | 0 | 4 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 23 |
+| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| szl | Silesian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tah | Tahitian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tam | Tamil | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 21 |
+| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| tat | Tatar | Turkic | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tbc | Takia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tbf | Mandara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tbo | Tawala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tca | Ticuna | Ticuna-Yuri | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 25 |
+| ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tfr | Teribe | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tgk | Tajik | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 4 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 22 |
+| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 |
+| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tmd | Haruai | Piawi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tnc | Tanimuca-Retuarã | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tod | Toma | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| toj | Tojolabal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| too | Xicotepec De Juárez Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tpa | Taupota | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
+| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ttc | Tektiteko | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tuc | Mutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
+| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tur | Turkish | Turkic | 0 | 3 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 1 | 0 | 24 |
+| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| txu | Kayapó | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tzl | Talossan | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| tzo | Tzotzil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ubr | Ubir | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| udu | Uduk | Koman | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| uig | Uighur | Turkic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
+| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 |
+| uli | Ulithian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ura | Urarina | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| urb | Urubú-Kaapor | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| urd | Urdu | Indo-European | 0 | 0 | 0 | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 19 |
+| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| urt | Urat | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| usp | Uspanteco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| uvl | Lote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 |
+| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
+| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 20 |
+| viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wal | Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wap | Wapishana | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wbi | Vwanji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wed | Wedau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wer | Weri | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wiu | Wiru | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wiv | Vitu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wnu | Usan | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 |
+| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wro | Worrorra | Worrorran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wrs | Waris | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wsk | Waskia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xav | Xavánte | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 |
+| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xsi | Sio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| xtm | Magdalena Peñasco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yal | Yalunka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yap | Yapese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
+| yid | Yiddish | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yka | Yakan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 16 |
+| yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
+| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zai | Isthmus Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zap | Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zho | Chinese | Unclassified | 0 | 2 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 0 | 0 | 23 |
+| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpv | Chichicapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 |
+| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 |
+| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
+| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 311 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 55 | 507 | 88 | 2 | 2 | 24 | 24 |
diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py
new file mode 100644
index 0000000000..64ca450c4d
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py
@@ -0,0 +1,462 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from time import time
+from typing import Any
+
+import tqdm
+from datasets import Features, Value, load_dataset
+from PIL import Image
+
+from mteb.abstasks.AbsTask import AbsTask, ScoresDict
+
+from ...evaluation.evaluators import Any2AnyMultiChoiceEvaluator
+
+logger = logging.getLogger(__name__)
+
+
class HFDataLoader:
    """Load a (corpus, queries, qrels) triple for Any2Any multiple-choice tasks.

    Data is read either from a Hugging Face repository (``hf_repo``, with
    ``corpus``/``query``/``qrels`` configs) or from local BEIR-style files
    (``data_folder`` containing ``corpus.jsonl``, ``queries.jsonl`` and
    ``qrels/<split>.tsv``).
    """

    def __init__(
        self,
        hf_repo: str | None = None,
        hf_repo_qrels: str | None = None,
        data_folder: str | None = None,
        prefix: str | None = None,
        corpus_file: str = "corpus.jsonl",
        query_file: str = "queries.jsonl",
        qrels_folder: str = "qrels",
        qrels_file: str = "",
        streaming: bool = False,
        keep_in_memory: bool = False,
    ):
        self.corpus = {}
        self.queries = {}
        self.qrels = {}
        self.hf_repo = hf_repo
        if hf_repo:
            # By default fetch qrels from same repo not a second repo with "-qrels" like in original
            self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo
        else:
            # data folder would contain these files:
            # (1) fiqa/corpus.jsonl (format: jsonlines)
            # (2) fiqa/queries.jsonl (format: jsonlines)
            # (3) fiqa/qrels/test.tsv (format: tsv ("\t"))
            if prefix:
                query_file = prefix + "-" + query_file
                qrels_folder = prefix + "-" + qrels_folder

        self.corpus_file = (
            os.path.join(data_folder, corpus_file) if data_folder else corpus_file
        )
        self.query_file = (
            os.path.join(data_folder, query_file) if data_folder else query_file
        )
        self.qrels_folder = (
            os.path.join(data_folder, qrels_folder) if data_folder else None
        )
        self.qrels_file = qrels_file
        self.streaming = streaming
        self.keep_in_memory = keep_in_memory

    @staticmethod
    def check(fIn: str, ext: str):
        """Raise ValueError if ``fIn`` does not exist or lacks extension ``ext``."""
        if not os.path.exists(fIn):
            raise ValueError(f"File {fIn} not present! Please provide accurate file.")

        if not fIn.endswith(ext):
            raise ValueError(f"File {fIn} must be present with extension {ext}")

    def load(
        self, split="test"
    ) -> tuple[
        dict[str, dict[str, str | Image.Image]],
        dict[str, dict[str, str | Image.Image]],
        dict[str, dict[str, int]],
    ]:
        """Load corpus, queries and qrels for ``split``.

        Returns:
            (corpus, queries, qrels) where qrels is a plain
            dict[query-id][corpus-id] -> int score, and queries are filtered
            to those that have at least one qrel entry.
        """
        if not self.hf_repo:
            self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv")
            self.check(fIn=self.corpus_file, ext="jsonl")
            self.check(fIn=self.query_file, ext="jsonl")
            self.check(fIn=self.qrels_file, ext="tsv")

        if not len(self.corpus):
            logger.info("Loading Corpus...")
            self._load_corpus()
            logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper())
            logger.info("Doc Example: %s", self.corpus[0])

        if not len(self.queries):
            logger.info("Loading Queries...")
            self._load_queries(split)

        self._load_qrels(split)
        # Flatten the qrels dataset into a nested dict; `.map` is used purely
        # for its side effect on `qrels_dict`.
        qrels_dict = defaultdict(dict)

        def qrels_dict_init(row):
            qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"])

        self.qrels.map(qrels_dict_init)
        self.qrels = qrels_dict
        # Drop queries with no qrels so every query has candidate judgements.
        self.queries = self.queries.filter(lambda x: x["id"] in self.qrels)
        logger.info("Loaded %d %s Queries.", len(self.queries), split.upper())
        logger.info("Query Example: %s", self.queries[0])

        return self.corpus, self.queries, self.qrels

    def load_corpus(self) -> dict[str, dict[str, str]]:
        """Load and return only the corpus (no queries/qrels)."""
        if not self.hf_repo:
            self.check(fIn=self.corpus_file, ext="jsonl")

        if not len(self.corpus):
            logger.info("Loading Corpus...")
            self._load_corpus()
            # Fixed: the format string previously had two placeholders
            # ("%d %s") but only one argument, causing a logging error.
            logger.info("Loaded %d Documents.", len(self.corpus))
            logger.info("Doc Example: %s", self.corpus[0])

        return self.corpus

    def _load_corpus(self):
        if self.hf_repo:
            corpus_ds = load_dataset(
                self.hf_repo,
                "corpus",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )["corpus"]
        else:
            # NOTE(review): without a split selection this returns a
            # DatasetDict, not a Dataset — confirm local-file path is used.
            corpus_ds = load_dataset(
                "json",
                data_files=self.corpus_file,
                streaming=self.streaming,
                keep_in_memory=self.keep_in_memory,
            )
        self.corpus = corpus_ds

    def _load_queries(self, split):
        if self.hf_repo:
            queries_ds = load_dataset(
                self.hf_repo,
                "query",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )[split]
        else:
            # NOTE(review): returns a DatasetDict for local files — verify.
            queries_ds = load_dataset(
                "json",
                data_files=self.query_file,
                streaming=self.streaming,
                keep_in_memory=self.keep_in_memory,
            )
        self.queries = queries_ds

    def _load_qrels(self, split):
        if self.hf_repo:
            qrels_ds = load_dataset(
                self.hf_repo_qrels,
                "qrels",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )[split]
        else:
            qrels_ds = load_dataset(
                "csv",
                data_files=self.qrels_file,
                delimiter="\t",
                keep_in_memory=self.keep_in_memory,
            )

        # Drop the TREC-style constant "Q0" column if present.
        if "Q0" in qrels_ds.column_names:
            qrels_ds = qrels_ds.remove_columns("Q0")
        features = Features(
            {
                "query-id": Value("string"),
                "corpus-id": Value("string"),
                "score": Value("float"),
            }
        )
        # Some datasets may have extra columns, e.g. `difficulty` in qrels for FORB.
        qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"]).cast(
            features
        )
        self.qrels = qrels_ds
+
+
class AbsTaskAny2AnyMultiChoice(AbsTask):
    """Abstract class for Any2Any multiple choice experiments

    This is NOT a retrieval task: there is one correct answer among a set of candidates, which are a subset of the corpus, indicated in qrels with a relevance of 0

    Child-classes must implement the following properties:

    self.corpus: dict[str, dict[str, str]]
        Semantically, it should contain dict[split_name, dict[sample_id, dict[str, str]]]
        E.g. {"test": {"document_one": {"_id": "d1", "title": "title", "text": "text"}}}

    self.queries: dict[str, dict[str, Union[str, List[str]]]]
        Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, List[str]]] for conversations
        E.g. {"test": {"q1": "query"}}
        or {"test": {"q1": ["turn1", "turn2", "turn3"]}}

    self.relevant_docs: dict[str, dict[str, dict[str, int]]]
        Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]]
        E.g.: {"test": {"q1": {"document_one": 1}}} for hard positive samples (the correct choice)
        E.g.: {"test": {"q1": {"document_two": 0}}} for hard negative samples (incorrect choices from the same query)
    """

    # Forwarded to the evaluator: whether to drop a retrieved doc whose id
    # equals the query id.
    ignore_identical_ids: bool = False
    # Forwarded to the evaluator: whether to discard the top-ranked result
    # before scoring.
    skip_first_result: bool = False

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load_data(self, **kwargs):
        """Populate self.corpus/queries/relevant_docs per eval split via HFDataLoader."""
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = {}, {}, {}
        dataset_path = self.metadata_dict["dataset"]["path"]

        for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
            corpus, queries, qrels = HFDataLoader(
                hf_repo=dataset_path,
                streaming=False,
                keep_in_memory=False,
            ).load(split=split)
            # directly pass in corpus and queries datasets now to prevent loading into memory
            # queries = {query["id"]: query for query in queries}
            # corpus = {doc["id"]: doc for doc in corpus}
            self.corpus[split], self.queries[split], self.relevant_docs[split] = (
                corpus,
                queries,
                qrels,
            )

        self.data_loaded = True

    def evaluate(
        self,
        model,
        split: str = "test",
        *,
        # NOTE(review): mutable default argument; never mutated here, but
        # consider `encode_kwargs: dict[str, Any] | None = None`.
        encode_kwargs: dict[str, Any] = {},
        **kwargs,
    ):
        """Run the multiple-choice evaluation for each hf_subset of `split`.

        Returns a dict mapping hf_subset name -> ScoresDict.
        """
        retriever = Any2AnyMultiChoiceEvaluator(
            retriever=model,
            task_name=self.metadata.name,
            encode_kwargs=encode_kwargs,
            **kwargs,
        )

        scores = {}
        hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]

        for hf_subset in hf_subsets:
            logger.info(f"Subset: {hf_subset}")

            if hf_subset == "default":
                # Monolingual layout: data is keyed by split only.
                corpus, queries, relevant_docs = (
                    self.corpus[split],
                    self.queries[split],
                    self.relevant_docs[split],
                )
            else:
                # Multilingual layout: data is keyed by subset, then split.
                corpus, queries, relevant_docs = (
                    self.corpus[hf_subset][split],
                    self.queries[hf_subset][split],
                    self.relevant_docs[hf_subset][split],
                )
            scores[hf_subset] = self._evaluate_subset(
                retriever, corpus, queries, relevant_docs, hf_subset, **kwargs
            )
        return scores

    def _evaluate_subset(
        self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs
    ):
        """Score one subset: run the evaluator, optionally dump predictions/errors.

        Supported kwargs: save_predictions, export_errors, output_folder, top_k.
        """
        start_time = time()
        # Unlike plain retrieval, qrels are passed in so scoring is restricted
        # to each query's candidate choices.
        results = retriever(corpus, queries, relevant_docs)
        end_time = time()
        logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds")

        save_predictions = kwargs.get("save_predictions", False)
        export_errors = kwargs.get("export_errors", False)
        if save_predictions or export_errors:
            output_folder = Path(kwargs.get("output_folder", "results"))
            if not os.path.isdir(output_folder):
                os.makedirs(output_folder)

        if save_predictions:
            top_k = kwargs.get("top_k", None)
            if top_k is not None:
                # Truncate each query's results to its top_k highest-scoring docs.
                for qid in list(results.keys()):
                    doc_ids = set(
                        sorted(
                            results[qid], key=lambda x: results[qid][x], reverse=True
                        )[:top_k]
                    )
                    results[qid] = {
                        k: v for k, v in results[qid].items() if k in doc_ids
                    }
            qrels_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_predictions.json"
            )

            with open(qrels_save_path, "w") as f:
                json.dump(results, f)

        ndcg, _map, recall, precision, cv_recall, naucs = retriever.evaluate(
            relevant_docs,
            results,
            retriever.k_values,
            ignore_identical_ids=self.ignore_identical_ids,
            skip_first_result=self.skip_first_result,
        )
        mrr, naucs_mrr = retriever.evaluate_custom(
            relevant_docs, results, retriever.k_values, "mrr"
        )
        # Normalize metric names from "Metric@k" to "metric_at_k".
        scores = {
            **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
            **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
            **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
            **{f"cv_recall_at_{k.split('@')[1]}": v for (k, v) in cv_recall.items()},
            **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
            **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()},
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs.items()
            },
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs_mrr.items()
            },
            # With one correct choice per query, accuracy equals Recall@1.
            "accuracy": recall["Recall@1"],
        }
        self._add_main_score(scores)

        if export_errors:
            errors = {}

            top_k = kwargs.get("top_k", 1)
            if not save_predictions and top_k == 1:
                # Results were not truncated above; keep only the best hit.
                for qid in results.keys():
                    doc_scores = results[qid]
                    sorted_docs = sorted(
                        doc_scores.items(), key=lambda x: x[1], reverse=True
                    )[:top_k]
                    results[qid] = dict(sorted_docs)
            for qid, retrieved_docs in results.items():
                expected_docs = relevant_docs[qid]
                false_positives = [
                    doc for doc in retrieved_docs if doc not in expected_docs
                ]
                false_negatives = [
                    doc for doc in expected_docs if doc not in retrieved_docs
                ]
                if false_positives or false_negatives:
                    errors[qid] = {
                        "false_positives": false_positives,
                        "false_negatives": false_negatives,
                    }

            errors_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_errors.json"
            )
            with open(errors_save_path, "w") as f:
                json.dump(errors, f)

        return scores

    def _add_main_score(self, scores: ScoresDict) -> None:
        # Copy the metadata-declared main metric under the "main_score" key.
        scores["main_score"] = scores[self.metadata.main_score]

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ):
        pass

    def calculate_metadata_metrics(self) -> dict[str, Any]:
        """Compute per-split (and per-language) descriptive statistics."""
        self.load_data()

        all_details = {}
        pbar_split = tqdm.tqdm(
            self.metadata_dict["eval_splits"], desc="Processing Splits..."
        )
        for split in pbar_split:
            pbar_split.set_postfix_str(f"Split: {split}")
            logger.info(f"Processing metadata for split {split}")
            all_details[split] = {}
            if self.is_multilingual:
                pbar_lang = tqdm.tqdm(
                    self.relevant_docs.keys(), desc="Processing Languages..."
                )
                for lang in pbar_lang:
                    pbar_lang.set_postfix_str(f"Language: {lang}")
                    logger.info(f"Processing metadata for language {lang}")
                    split_details = process_language(
                        self.relevant_docs[lang][split],
                        self.queries[lang][split],
                        self.corpus[lang][split],
                        lang,
                    )
                    all_details[split][lang] = split_details
            else:
                split_details = process_language(
                    self.relevant_docs[split], self.queries[split], self.corpus[split]
                )
                all_details[split] = split_details

        return all_details
+
+
def process_language(relevant_docs, queries, corpus, lang=None):
    """Summarize one language split: corpus/query counts, average character
    lengths, and the average number of positively-judged documents per query.
    """
    avg_query_len, avg_doc_len = calculate_length(queries, corpus)
    n_docs = len(corpus)
    n_queries = len(queries)

    # Only qrels with a non-zero relevance judgement count as positives.
    n_positive_qrels = 0
    for judgements in relevant_docs.values():
        for score in judgements.values():
            if score != 0:
                n_positive_qrels += 1
    avg_positive_per_query = n_positive_qrels / n_queries if n_queries else 0

    suffix = f" for language {lang}" if lang else ""
    logger.info(f"Average document character length{suffix} is {avg_doc_len}")
    logger.info(f"Average query character length{suffix} is {avg_query_len}")
    logger.info(f"Number of documents{suffix} is {n_docs}")
    logger.info(f"Number of queries{suffix} is {n_queries}")
    logger.info(
        f"Average number of relevant documents per query{suffix} is {avg_positive_per_query}"
    )
    return {
        "average_document_length": avg_doc_len,
        "average_query_length": avg_query_len,
        "num_documents": n_docs,
        "num_queries": n_queries,
        "average_relevant_docs_per_query": avg_positive_per_query,
    }


def calculate_length(queries, corpus):
    """Return (avg query char length, avg doc "length").

    Image documents contribute a constant length of 1.0; non-image documents
    are not counted.
    """
    query_lengths = [len(q) for q in queries.values()]
    image_lengths = [1.0 for d in corpus.values() if isinstance(d, Image.Image)]

    avg_query = sum(query_lengths) / len(query_lengths) if query_lengths else 0
    avg_doc = sum(image_lengths) / len(image_lengths) if image_lengths else 0
    return avg_query, avg_doc
diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py
new file mode 100644
index 0000000000..9913370666
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py
@@ -0,0 +1,457 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from time import time
+from typing import Any
+
+import tqdm
+from datasets import Features, Value, load_dataset
+from PIL import Image
+
+from ...evaluation.evaluators import Any2AnyRetrievalEvaluator
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
class HFDataLoader:
    """Load a (corpus, queries, qrels) triple for Any2Any retrieval tasks.

    Data is read either from a Hugging Face repository (``hf_repo``, with
    ``corpus``/``query``/``qrels`` configs) or from local BEIR-style files
    (``data_folder`` containing ``corpus.jsonl``, ``queries.jsonl`` and
    ``qrels/<split>.tsv``).
    """

    def __init__(
        self,
        hf_repo: str | None = None,
        hf_repo_qrels: str | None = None,
        data_folder: str | None = None,
        prefix: str | None = None,
        corpus_file: str = "corpus.jsonl",
        query_file: str = "queries.jsonl",
        qrels_folder: str = "qrels",
        qrels_file: str = "",
        streaming: bool = False,
        keep_in_memory: bool = False,
    ):
        self.corpus = {}
        self.queries = {}
        self.qrels = {}
        self.hf_repo = hf_repo
        if hf_repo:
            # By default fetch qrels from same repo not a second repo with "-qrels" like in original
            self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo
        else:
            # data folder would contain these files:
            # (1) fiqa/corpus.jsonl (format: jsonlines)
            # (2) fiqa/queries.jsonl (format: jsonlines)
            # (3) fiqa/qrels/test.tsv (format: tsv ("\t"))
            if prefix:
                query_file = prefix + "-" + query_file
                qrels_folder = prefix + "-" + qrels_folder

        self.corpus_file = (
            os.path.join(data_folder, corpus_file) if data_folder else corpus_file
        )
        self.query_file = (
            os.path.join(data_folder, query_file) if data_folder else query_file
        )
        self.qrels_folder = (
            os.path.join(data_folder, qrels_folder) if data_folder else None
        )
        self.qrels_file = qrels_file
        self.streaming = streaming
        self.keep_in_memory = keep_in_memory

    @staticmethod
    def check(fIn: str, ext: str):
        """Raise ValueError if ``fIn`` does not exist or lacks extension ``ext``."""
        if not os.path.exists(fIn):
            raise ValueError(f"File {fIn} not present! Please provide accurate file.")

        if not fIn.endswith(ext):
            raise ValueError(f"File {fIn} must be present with extension {ext}")

    def load(
        self, split="test"
    ) -> tuple[
        dict[str, dict[str, str | Image.Image]],
        dict[str, dict[str, str | Image.Image]],
        dict[str, dict[str, int]],
    ]:
        """Load corpus, queries and qrels for ``split``.

        Returns:
            (corpus, queries, qrels) where qrels is a plain
            dict[query-id][corpus-id] -> int score, and queries are filtered
            to those that have at least one qrel entry.
        """
        if not self.hf_repo:
            self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv")
            self.check(fIn=self.corpus_file, ext="jsonl")
            self.check(fIn=self.query_file, ext="jsonl")
            self.check(fIn=self.qrels_file, ext="tsv")

        if not len(self.corpus):
            logger.info("Loading Corpus...")
            self._load_corpus()
            logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper())
            logger.info("Doc Example: %s", self.corpus[0])

        if not len(self.queries):
            logger.info("Loading Queries...")
            self._load_queries(split)

        self._load_qrels(split)
        # Flatten the qrels dataset into a nested dict; `.map` is used purely
        # for its side effect on `qrels_dict`.
        qrels_dict = defaultdict(dict)

        def qrels_dict_init(row):
            qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"])

        self.qrels.map(qrels_dict_init)
        self.qrels = qrels_dict
        # Drop queries with no qrels so every query has judged documents.
        self.queries = self.queries.filter(lambda x: x["id"] in self.qrels)
        logger.info("Loaded %d %s Queries.", len(self.queries), split.upper())
        logger.info("Query Example: %s", self.queries[0])

        return self.corpus, self.queries, self.qrels

    def load_corpus(self) -> dict[str, dict[str, str]]:
        """Load and return only the corpus (no queries/qrels)."""
        if not self.hf_repo:
            self.check(fIn=self.corpus_file, ext="jsonl")

        if not len(self.corpus):
            logger.info("Loading Corpus...")
            self._load_corpus()
            # Fixed: the format string previously had two placeholders
            # ("%d %s") but only one argument, causing a logging error.
            logger.info("Loaded %d Documents.", len(self.corpus))
            logger.info("Doc Example: %s", self.corpus[0])

        return self.corpus

    def _load_corpus(self):
        if self.hf_repo:
            corpus_ds = load_dataset(
                self.hf_repo,
                "corpus",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )["corpus"]
        else:
            # NOTE(review): without a split selection this returns a
            # DatasetDict, not a Dataset — confirm local-file path is used.
            corpus_ds = load_dataset(
                "json",
                data_files=self.corpus_file,
                streaming=self.streaming,
                keep_in_memory=self.keep_in_memory,
            )
        self.corpus = corpus_ds

    def _load_queries(self, split):
        if self.hf_repo:
            queries_ds = load_dataset(
                self.hf_repo,
                "query",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )[split]
        else:
            # NOTE(review): returns a DatasetDict for local files — verify.
            queries_ds = load_dataset(
                "json",
                data_files=self.query_file,
                streaming=self.streaming,
                keep_in_memory=self.keep_in_memory,
            )
        self.queries = queries_ds

    def _load_qrels(self, split):
        if self.hf_repo:
            qrels_ds = load_dataset(
                self.hf_repo_qrels,
                "qrels",
                keep_in_memory=self.keep_in_memory,
                streaming=self.streaming,
            )[split]
        else:
            qrels_ds = load_dataset(
                "csv",
                data_files=self.qrels_file,
                delimiter="\t",
                keep_in_memory=self.keep_in_memory,
            )

        # Drop the TREC-style constant "Q0" column if present.
        if "Q0" in qrels_ds.column_names:
            qrels_ds = qrels_ds.remove_columns("Q0")
        features = Features(
            {
                "query-id": Value("string"),
                "corpus-id": Value("string"),
                "score": Value("float"),
            }
        )
        # Some datasets may have extra columns, e.g. `difficulty` in qrels for FORB.
        qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"]).cast(
            features
        )
        self.qrels = qrels_ds
+
+
class AbsTaskAny2AnyRetrieval(AbsTask):
    """Abstract class for retrieval experiments.

    Child-classes must implement the following properties:

    self.corpus: dict[str, dict[str, str]]
        Semantically, it should contain dict[split_name, dict[sample_id, dict[str, str]]]
        E.g. {"test": {"document_one": {"_id": "d1", "title": "title", "text": "text"}}}

    self.queries: dict[str, dict[str, Union[str, List[str]]]]
        Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, List[str]]] for conversations
        E.g. {"test": {"q1": "query"}}
        or {"test": {"q1": ["turn1", "turn2", "turn3"]}}

    self.relevant_docs: dict[str, dict[str, dict[str, int]]]
        Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]]
        E.g.: {"test": {"q1": {"document_one": 1}}}
    """

    # Forwarded to the evaluator: whether to drop a retrieved doc whose id
    # equals the query id.
    ignore_identical_ids: bool = False
    # Forwarded to the evaluator: whether to discard the top-ranked result
    # before scoring.
    skip_first_result: bool = False

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load_data(self, **kwargs):
        """Populate self.corpus/queries/relevant_docs per eval split via HFDataLoader."""
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = {}, {}, {}
        dataset_path = self.metadata_dict["dataset"]["path"]

        for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
            corpus, queries, qrels = HFDataLoader(
                hf_repo=dataset_path,
                streaming=False,
                keep_in_memory=False,
            ).load(split=split)
            # directly pass in corpus and queries datasets now to prevent loading into memory
            # queries = {query["id"]: query for query in queries}
            # corpus = {doc["id"]: doc for doc in corpus}
            self.corpus[split], self.queries[split], self.relevant_docs[split] = (
                corpus,
                queries,
                qrels,
            )

        self.data_loaded = True

    def evaluate(
        self,
        model,
        split: str = "test",
        *,
        # NOTE(review): mutable default argument; never mutated here, but
        # consider `encode_kwargs: dict[str, Any] | None = None`.
        encode_kwargs: dict[str, Any] = {},
        **kwargs,
    ):
        """Run the retrieval evaluation for each hf_subset of `split`.

        Returns a dict mapping hf_subset name -> ScoresDict.
        """
        retriever = Any2AnyRetrievalEvaluator(
            retriever=model,
            task_name=self.metadata.name,
            encode_kwargs=encode_kwargs,
            **kwargs,
        )

        scores = {}
        hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]

        for hf_subset in hf_subsets:
            logger.info(f"Subset: {hf_subset}")

            if hf_subset == "default":
                # Monolingual layout: data is keyed by split only.
                corpus, queries, relevant_docs = (
                    self.corpus[split],
                    self.queries[split],
                    self.relevant_docs[split],
                )
            else:
                # Multilingual layout: data is keyed by subset, then split.
                corpus, queries, relevant_docs = (
                    self.corpus[hf_subset][split],
                    self.queries[hf_subset][split],
                    self.relevant_docs[hf_subset][split],
                )
            scores[hf_subset] = self._evaluate_subset(
                retriever, corpus, queries, relevant_docs, hf_subset, **kwargs
            )
        return scores

    def _evaluate_subset(
        self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs
    ):
        """Score one subset: run the retriever, optionally dump predictions/errors.

        Supported kwargs: save_predictions, export_errors, output_folder, top_k.
        """
        start_time = time()
        results = retriever(corpus, queries)
        end_time = time()
        logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds")

        save_predictions = kwargs.get("save_predictions", False)
        export_errors = kwargs.get("export_errors", False)
        if save_predictions or export_errors:
            output_folder = Path(kwargs.get("output_folder", "results"))
            if not os.path.isdir(output_folder):
                os.makedirs(output_folder)

        if save_predictions:
            top_k = kwargs.get("top_k", None)
            if top_k is not None:
                # Truncate each query's results to its top_k highest-scoring docs.
                for qid in list(results.keys()):
                    doc_ids = set(
                        sorted(
                            results[qid], key=lambda x: results[qid][x], reverse=True
                        )[:top_k]
                    )
                    results[qid] = {
                        k: v for k, v in results[qid].items() if k in doc_ids
                    }
            qrels_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_predictions.json"
            )

            with open(qrels_save_path, "w") as f:
                json.dump(results, f)

        ndcg, _map, recall, precision, cv_recall, naucs = retriever.evaluate(
            relevant_docs,
            results,
            retriever.k_values,
            ignore_identical_ids=self.ignore_identical_ids,
            skip_first_result=self.skip_first_result,
        )
        mrr, naucs_mrr = retriever.evaluate_custom(
            relevant_docs, results, retriever.k_values, "mrr"
        )
        # Normalize metric names from "Metric@k" to "metric_at_k".
        scores = {
            **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
            **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
            **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
            **{f"cv_recall_at_{k.split('@')[1]}": v for (k, v) in cv_recall.items()},
            **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
            **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()},
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs.items()
            },
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs_mrr.items()
            },
        }
        self._add_main_score(scores)

        if export_errors:
            errors = {}

            top_k = kwargs.get("top_k", 1)
            if not save_predictions and top_k == 1:
                # Results were not truncated above; keep only the best hit.
                for qid in results.keys():
                    doc_scores = results[qid]
                    sorted_docs = sorted(
                        doc_scores.items(), key=lambda x: x[1], reverse=True
                    )[:top_k]
                    results[qid] = dict(sorted_docs)
            for qid, retrieved_docs in results.items():
                expected_docs = relevant_docs[qid]
                false_positives = [
                    doc for doc in retrieved_docs if doc not in expected_docs
                ]
                false_negatives = [
                    doc for doc in expected_docs if doc not in retrieved_docs
                ]
                if false_positives or false_negatives:
                    errors[qid] = {
                        "false_positives": false_positives,
                        "false_negatives": false_negatives,
                    }

            errors_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_errors.json"
            )
            with open(errors_save_path, "w") as f:
                json.dump(errors, f)

        return scores

    def _add_main_score(self, scores: ScoresDict) -> None:
        # Copy the metadata-declared main metric under the "main_score" key.
        scores["main_score"] = scores[self.metadata.main_score]

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ):
        pass

    def calculate_metadata_metrics(self) -> dict[str, Any]:
        """Compute per-split (and per-language) descriptive statistics."""
        self.load_data()

        all_details = {}
        pbar_split = tqdm.tqdm(
            self.metadata_dict["eval_splits"], desc="Processing Splits..."
        )
        for split in pbar_split:
            pbar_split.set_postfix_str(f"Split: {split}")
            logger.info(f"Processing metadata for split {split}")
            all_details[split] = {}
            if self.is_multilingual:
                pbar_lang = tqdm.tqdm(
                    self.relevant_docs.keys(), desc="Processing Languages..."
                )
                for lang in pbar_lang:
                    pbar_lang.set_postfix_str(f"Language: {lang}")
                    logger.info(f"Processing metadata for language {lang}")
                    split_details = process_language(
                        self.relevant_docs[lang][split],
                        self.queries[lang][split],
                        self.corpus[lang][split],
                        lang,
                    )
                    all_details[split][lang] = split_details
            else:
                split_details = process_language(
                    self.relevant_docs[split], self.queries[split], self.corpus[split]
                )
                all_details[split] = split_details

        return all_details
+
+
def process_language(relevant_docs, queries, corpus, lang=None):
    """Summarize one language split: corpus/query counts, average character
    lengths, and the average number of positively-judged documents per query.
    """
    avg_query_len, avg_doc_len = calculate_length(queries, corpus)
    n_docs = len(corpus)
    n_queries = len(queries)

    # Only qrels with a non-zero relevance judgement count as positives.
    n_positive_qrels = 0
    for judgements in relevant_docs.values():
        for score in judgements.values():
            if score != 0:
                n_positive_qrels += 1
    avg_positive_per_query = n_positive_qrels / n_queries if n_queries else 0

    suffix = f" for language {lang}" if lang else ""
    logger.info(f"Average document character length{suffix} is {avg_doc_len}")
    logger.info(f"Average query character length{suffix} is {avg_query_len}")
    logger.info(f"Number of documents{suffix} is {n_docs}")
    logger.info(f"Number of queries{suffix} is {n_queries}")
    logger.info(
        f"Average number of relevant documents per query{suffix} is {avg_positive_per_query}"
    )
    return {
        "average_document_length": avg_doc_len,
        "average_query_length": avg_query_len,
        "num_documents": n_docs,
        "num_queries": n_queries,
        "average_relevant_docs_per_query": avg_positive_per_query,
    }


def calculate_length(queries, corpus):
    """Return (avg query char length, avg doc "length").

    Image documents contribute a constant length of 1.0; non-image documents
    are not counted.
    """
    query_lengths = [len(q) for q in queries.values()]
    image_lengths = [1.0 for d in corpus.values() if isinstance(d, Image.Image)]

    avg_query = sum(query_lengths) / len(query_lengths) if query_lengths else 0
    avg_doc = sum(image_lengths) / len(image_lengths) if image_lengths else 0
    return avg_query, avg_doc
diff --git a/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py b/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py
new file mode 100644
index 0000000000..50991b6aee
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from datasets import Dataset
+
+from ...encoder_interface import Encoder
+from ...evaluation.evaluators import Any2TextMultipleChoiceEvaluator
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
class AbsTaskAny2TextMultipleChoice(AbsTask):
    """Abstract class for Any-to-Text multiple-choice tasks.

    Queries may be text, images, or both (interleaved encoding); the
    similarity between each query and its candidate text choices is ranked.

    self.load_data() must generate a huggingface dataset with a split matching
    self.metadata_dict["eval_splits"], and assign it to self.dataset.
    """

    query_modalities: list[str] | str = ["image", "text"]
    query_column_names: dict = {"image": "image", "text": "question"}
    label_column_name: str = "answer"
    choices_column_name: str = "choices"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _add_main_score(self, scores) -> None:
        # Expose the metadata-declared main metric under "main_score".
        scores["main_score"] = scores[self.metadata.main_score]

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ):
        pass

    def _evaluate_subset(
        self,
        model: Encoder,
        dataset: Dataset,
        *,
        encode_kwargs: dict[str, Any] = {},
        **kwargs,
    ) -> ScoresDict:
        # Fail fast when a configured modality has no column mapping.
        missing = next(
            (m for m in self.query_modalities if m not in self.query_column_names),
            None,
        )
        if missing is not None:
            raise KeyError(f"query column name of modality {missing} is not defined")

        evaluator = Any2TextMultipleChoiceEvaluator(
            dataset,
            query_modalities=self.query_modalities,
            query_column_names=self.query_column_names,
            label_column_name=self.label_column_name,
            choices_column_name=self.choices_column_name,
            task_name=self.metadata.name,
            **kwargs,
        )
        scores = evaluator(model, encode_kwargs=encode_kwargs)
        self._add_main_score(scores)
        return scores
diff --git a/mteb/abstasks/Image/AbsTaskImageClassification.py b/mteb/abstasks/Image/AbsTaskImageClassification.py
new file mode 100644
index 0000000000..7add58296a
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskImageClassification.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from typing import Any
+
+import numpy as np
+from PIL import ImageFile
+
+from mteb.abstasks.TaskMetadata import HFSubset
+
+from ...encoder_interface import Encoder
+from ...evaluation.evaluators import (
+ ImagekNNClassificationEvaluator,
+ ImagekNNClassificationEvaluatorPytorch,
+ ImagelogRegClassificationEvaluator,
+)
+from ..AbsTask import AbsTask, ScoresDict
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+logger = logging.getLogger(__name__)
+
+
+class AbsTaskImageClassification(AbsTask):
+ """Abstract class for kNN classification tasks
+ The similarity is computed between pairs and the results are ranked.
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It
+ must contain the following columns:
+ image: Image.Image
+ label: int
+ """
+
+ image_column_name: str = "image"
+ label_column_name: str = "label"
+
+ def __init__(
+ self,
+ method: str = "logReg",
+ n_experiments: int | None = None,
+ samples_per_label: int | None = None,
+ k: int = 3,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.method = method
+
+ # Bootstrap parameters
+ self.n_experiments: int = ( # type: ignore
+ n_experiments
+ if n_experiments is not None
+ else self.metadata_dict.get("n_experiments", 5)
+ )
+ self.samples_per_label: int = ( # type: ignore
+ samples_per_label
+ if samples_per_label is not None
+ else self.metadata_dict.get("samples_per_label", 16)
+ )
+
+ # kNN parameters
+ self.k = k
+
+ # Run metadata validation by instantiating addressing the attribute
+ # This is quite hacky. Ideally, this would be done in the constructor of
+ # each concrete task, but then we have to duplicate the __init__ method's
+ # interface.
+ if hasattr(self, "metadata"):
+ self.metadata
+
+ def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ):
+ pass
+
+ def evaluate(
+ self,
+ model,
+ eval_split: str = "test",
+ train_split: str = "train",
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ) -> dict[HFSubset, ScoresDict]:
+ if not self.data_loaded:
+ self.load_data()
+
+ scores = {}
+ hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
+
+ for hf_subset in hf_subsets:
+ logger.info(
+ f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
+ )
+
+ if hf_subset not in self.dataset and hf_subset == "default":
+ ds = self.dataset
+ else:
+ ds = self.dataset[hf_subset]
+ scores[hf_subset] = self._evaluate_subset(
+ model,
+ ds,
+ eval_split,
+ train_split,
+ encode_kwargs=encode_kwargs,
+ **kwargs,
+ )
+ self._add_main_score(scores[hf_subset])
+
+ return scores
+
+ def _evaluate_subset(
+ self,
+ model: Encoder,
+ dataset,
+ eval_split: str = "test",
+ train_split: str = "train",
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ) -> ScoresDict:
+ train_split = dataset[train_split]
+ eval_split = dataset[eval_split]
+ params = {"k": self.k}
+ params.update(kwargs)
+
+ scores = []
+ test_cache, idxs = (
+ None,
+ None,
+ ) # we store idxs to make the shuffling reproducible
+ for i in range(self.n_experiments):
+ logger.info(
+ "=" * 10 + f" Experiment {i+1}/{self.n_experiments} " + "=" * 10
+ )
+ # Bootstrap `self.samples_per_label` samples per label for each split
+ undersampled_train, idxs = self._undersample_data(
+ train_split,
+ self.label_column_name,
+ self.samples_per_label,
+ idxs=idxs,
+ )
+
+ if self.method == "kNN":
+ evaluator = ImagekNNClassificationEvaluator(
+ undersampled_train,
+ eval_split,
+ self.image_column_name,
+ self.label_column_name,
+ task_name=self.metadata.name,
+ encode_kwargs=encode_kwargs,
+ **params,
+ )
+ elif self.method == "kNN-pytorch":
+ evaluator = ImagekNNClassificationEvaluatorPytorch(
+ undersampled_train,
+ eval_split,
+ self.image_column_name,
+ self.label_column_name,
+ task_name=self.metadata.name,
+ encode_kwargs=encode_kwargs,
+ **params,
+ )
+ elif self.method == "logReg":
+ evaluator = ImagelogRegClassificationEvaluator(
+ undersampled_train,
+ eval_split,
+ self.image_column_name,
+ self.label_column_name,
+ task_name=self.metadata.name,
+ encode_kwargs=encode_kwargs,
+ **params,
+ )
+ else:
+ raise ValueError(f"Method {self.method} not supported")
+
+ scores_exp, test_cache = evaluator(model, test_cache=test_cache)
+ scores.append(scores_exp)
+
+ avg_scores: dict[str, Any] = {
+ k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
+ }
+ avg_scores["scores_per_experiment"] = scores
+ return avg_scores
+
+ def _undersample_data(
+ self, dataset_split, label_column_name, samples_per_label, idxs=None
+ ):
+ """Undersample data to have samples_per_label samples of each label
+ without loading all images into memory.
+ """
+ if idxs is None:
+ idxs = np.arange(len(dataset_split))
+ np.random.shuffle(idxs)
+ if not isinstance(idxs, list):
+ idxs = idxs.tolist()
+ label_counter = defaultdict(int)
+ selected_indices = []
+
+ labels = dataset_split[label_column_name]
+ for i in idxs:
+ label = labels[i]
+ if label_counter[label] < samples_per_label:
+ selected_indices.append(i)
+ label_counter[label] += 1
+
+ undersampled_dataset = dataset_split.select(selected_indices)
+ return (
+ undersampled_dataset,
+ idxs,
+ )
diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py
new file mode 100644
index 0000000000..8152bf10f7
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskImageClustering.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from datasets import Dataset
+
+from mteb.abstasks.TaskMetadata import HFSubset
+
+from ...encoder_interface import Encoder
+from ...evaluation.evaluators import ImageClusteringEvaluator
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
+class AbsTaskImageClustering(AbsTask):
+ """Abstract class for Clustering tasks
+ The similarity is computed between pairs and the results are ranked.
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ image: Image.Image
+ label: int
+ """
+
+ image_column_name: str = "image"
+ label_column_name: str = "label"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ):
+ pass
+
+ def _evaluate_subset(
+ self,
+ model: Encoder,
+ dataset: Dataset,
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ) -> ScoresDict:
+ evaluator = ImageClusteringEvaluator(
+ dataset[self.image_column_name],
+ dataset[self.label_column_name],
+ task_name=self.metadata.name,
+ **kwargs,
+ )
+ metrics = evaluator(model, encode_kwargs=encode_kwargs)
+ self._add_main_score(metrics)
+ return metrics
diff --git a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
new file mode 100644
index 0000000000..dc779d5e69
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
@@ -0,0 +1,216 @@
+from __future__ import annotations
+
+import itertools
+import logging
+from collections import defaultdict
+from typing import Any
+
+import numpy as np
+from sklearn.base import ClassifierMixin, clone
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score, label_ranking_average_precision_score
+from sklearn.model_selection import train_test_split
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.preprocessing import MultiLabelBinarizer
+
+from mteb.abstasks.TaskMetadata import HFSubset
+
+from ...encoder_interface import Encoder
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate_classifier(
+ embeddings_train: np.ndarray,
+ y_train: np.ndarray,
+ embeddings_test: np.ndarray,
+ y_test: np.ndarray,
+ classifier: ClassifierMixin,
+):
+ scores = {}
+ classifier = clone(classifier)
+ classifier.fit(embeddings_train, y_train)
+ y_pred = classifier.predict(embeddings_test)
+ accuracy = classifier.score(embeddings_test, y_test)
+ f1 = f1_score(y_test, y_pred, average="macro")
+ scores["accuracy"] = accuracy
+ scores["f1"] = f1
+ all_probs = []
+ for estimator in classifier.estimators_:
+ probs = estimator.predict_proba(embeddings_test)[:, 1]
+ all_probs.append(probs)
+
+ y_score = np.stack(all_probs, axis=1) # shape: (n_samples, n_labels)
+ lrap = label_ranking_average_precision_score(y_test, y_score)
+ scores["lrap"] = lrap
+ return scores
+
+
+class AbsTaskImageMultilabelClassification(AbsTask):
+ """Abstract class for image multioutput classification tasks
+ The similarity is computed between pairs and the results are ranked.
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ image: list[PIL.Image]
+ labels: list[Hashable]
+ """
+
+ image_column_name: str = "image"
+ label_column_name: str = "labels"
+
+ classifier = MultiOutputClassifier(estimator=LogisticRegression())
+
+ def __init__(
+ self,
+ n_experiments=None,
+ samples_per_label=None,
+ batch_size=32,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.batch_size = batch_size
+
+ # Bootstrap parameters
+ self.n_experiments = n_experiments or getattr(self, "n_experiments", 10)
+ self.samples_per_label = samples_per_label or getattr(
+ self, "samples_per_label", 8
+ )
+ # Run metadata validation by instantiating addressing the attribute
+ # This is quite hacky. Ideally, this would be done in the constructor of
+ # each concrete task, but then we have to duplicate the __init__ method's
+ # interface.
+ if hasattr(self, "metadata"):
+ self.metadata
+
+ def _add_main_score(self, scores):
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ):
+ pass
+
+ def evaluate(
+ self,
+ model: Encoder,
+ eval_split: str = "test",
+ train_split: str = "train",
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs: Any,
+ ) -> dict[HFSubset, ScoresDict]:
+ if not self.data_loaded:
+ self.load_data()
+
+ scores = {}
+ hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
+
+ for hf_subset in hf_subsets:
+ logger.info(
+ f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
+ )
+
+ if hf_subset not in self.dataset and hf_subset == "default":
+ ds = self.dataset
+ else:
+ ds = self.dataset[hf_subset]
+ scores[hf_subset] = self._evaluate_subset(
+ model,
+ ds,
+ eval_split,
+ train_split,
+ encode_kwargs=encode_kwargs,
+ **kwargs,
+ )
+ self._add_main_score(scores[hf_subset])
+
+ return scores
+
+ def _evaluate_subset(
+ self,
+ model: Encoder,
+ dataset,
+ eval_split: str = "test",
+ train_split: str = "train",
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs: Any,
+ ) -> ScoresDict:
+ train_split = dataset[train_split]
+ eval_split = dataset[eval_split]
+ params = {
+ "classifier_type": type(self.classifier).__name__,
+ "classifier_params": self.classifier.get_params(),
+ "batch_size": self.batch_size,
+ }
+ params.update(kwargs)
+
+ scores = []
+ # Bootstrap sample indices from training set for each experiment
+ train_samples = []
+ for _ in range(self.n_experiments):
+ sample_indices, _ = self._undersample_data_indices(
+ train_split[self.label_column_name], self.samples_per_label, None
+ )
+ train_samples.append(sample_indices)
+ # Encode all unique images at the indices
+ unique_train_indices = list(set(itertools.chain.from_iterable(train_samples)))
+ unique_train_images = train_split.select(unique_train_indices)[
+ self.image_column_name
+ ]
+
+ _unique_train_embeddings = model.get_image_embeddings(
+ unique_train_images,
+ **encode_kwargs,
+ )
+ unique_train_embeddings = dict(
+ zip(unique_train_indices, _unique_train_embeddings)
+ )
+ test_images = eval_split[self.image_column_name]
+ binarizer = MultiLabelBinarizer()
+ y_test = binarizer.fit_transform(eval_split[self.label_column_name])
+ # Stratified subsampling of test set to 2000 examples.
+ try:
+ if len(test_images) > 2000:
+ test_images, _, y_test, _ = train_test_split(
+ test_images, y_test, stratify=y_test, train_size=2000
+ )
+ except ValueError:
+ logger.warning("Couldn't subsample, continuing with the entire test set.")
+
+ X_test = model.get_image_embeddings(test_images, **encode_kwargs)
+ for i_experiment, sample_indices in enumerate(train_samples):
+ logger.info(
+ "=" * 10
+ + f" Experiment {i_experiment+1}/{self.n_experiments} "
+ + "=" * 10
+ )
+ X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
+ y_train = train_split.select(sample_indices)[self.label_column_name]
+ y_train = binarizer.transform(y_train)
+ scores_exp = evaluate_classifier(
+ X_train, y_train, X_test, y_test, self.classifier
+ )
+ scores.append(scores_exp)
+
+ avg_scores: dict[str, Any] = {
+ k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
+ }
+ avg_scores["scores_per_experiment"] = scores
+
+ return avg_scores
+
+ def _undersample_data_indices(self, y, samples_per_label, idxs=None):
+ """Undersample data to have samples_per_label samples of each label"""
+ sample_indices = []
+ if idxs is None:
+ idxs = np.arange(len(y))
+ np.random.shuffle(idxs)
+ label_counter = defaultdict(int)
+ for i in idxs:
+ if any((label_counter[label] < samples_per_label) for label in y[i]):
+ sample_indices.append(i)
+ for label in y[i]:
+ label_counter[label] += 1
+ return sample_indices, idxs
diff --git a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py
new file mode 100644
index 0000000000..b635610127
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from datasets import Dataset
+
+from ...encoder_interface import Encoder
+from ...evaluation.evaluators import ImageTextPairClassificationEvaluator
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
+class AbsTaskImageTextPairClassification(AbsTask):
+ """Abstract class for Image Text Pair Classification tasks,
+ e.g. Compositionality evaluation.
+ The similarity is computed between pairs and the results are ranked.
+ Note that the number of images and the number of captions can be different.
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ images: List[List[Image.Image]]
+ captions: List[List[str]]
+ """
+
+ # it can be ["image_0", "image_1"]; ["text_0", "text_1"] for datasets like WinoGround
+ images_column_names: str | list[str] = "image"
+ texts_column_names: str | list[str] = "caption"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def _add_main_score(self, scores) -> None:
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ):
+ pass
+
+ def _evaluate_subset(
+ self,
+ model: Encoder,
+ dataset: Dataset,
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ) -> ScoresDict:
+ evaluator = ImageTextPairClassificationEvaluator(
+ dataset,
+ images_column_names=self.images_column_names,
+ texts_column_names=self.texts_column_names,
+ task_name=self.metadata.name,
+ **kwargs,
+ )
+ scores = evaluator(model, encode_kwargs=encode_kwargs)
+ self._add_main_score(scores)
+ return scores
diff --git a/mteb/abstasks/Image/AbsTaskVisualSTS.py b/mteb/abstasks/Image/AbsTaskVisualSTS.py
new file mode 100644
index 0000000000..45de465eac
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskVisualSTS.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from ...evaluation.evaluators import VisualSTSEvaluator
+from ..AbsTask import AbsTask, DescriptiveStatistics, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
+class VisualSTSDescriptiveStatistics(DescriptiveStatistics):
+ """Descriptive statistics for STS
+
+ Attributes:
+ num_samples: number of samples in the dataset
+ avg_score: Average score
+ """
+
+ # TODO: what are useful stats for visual STS tasks?
+ # average_pixel_width; average_pixel_height; average non-white boxes?
+
+ num_samples: int
+ avg_score: float
+
+
+class AbsTaskVisualSTS(AbsTask):
+ """Abstract class for visual STS experiments.
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ sentence1: PIL.Image
+ sentence2: PIL.Image
+ score: float
+ """
+
+ sentences_column_names = ["sentence1", "sentence2"]
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ @property
+ def min_score(self) -> int:
+ return self.metadata_dict["min_score"]
+
+ @property
+ def max_score(self) -> int:
+ return self.metadata_dict["max_score"]
+
+ def _evaluate_subset(
+ self, model, data_split, *, encode_kwargs: dict[str, Any] = {}, **kwargs
+ ) -> ScoresDict:
+ def normalize(x):
+ return (x - self.min_score) / (self.max_score - self.min_score)
+
+ normalized_scores = list(map(normalize, data_split["score"]))
+ evaluator = VisualSTSEvaluator(
+ data_split,
+ self.sentences_column_names,
+ normalized_scores,
+ task_name=self.metadata.name,
+ **kwargs,
+ )
+ scores = evaluator(model, encode_kwargs=encode_kwargs)
+
+ self._add_main_score(scores)
+ return scores
+
+ def _add_main_score(self, scores: ScoresDict) -> None:
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ) -> VisualSTSDescriptiveStatistics:
+ if hf_subset:
+ score = self.dataset[hf_subset][split]["score"]
+ elif compute_overall:
+ score = []
+ for hf_subset in self.metadata.eval_langs:
+ score.extend(self.dataset[hf_subset][split]["score"])
+ else:
+ score = self.dataset[split]["score"]
+
+ avg_score = sum(score) / len(score)
+ return VisualSTSDescriptiveStatistics(
+ num_samples=len(score),
+ avg_score=avg_score,
+ )
diff --git a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py
new file mode 100644
index 0000000000..36bdd27103
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from datasets import Dataset
+
+from ...encoder_interface import Encoder
+from ...evaluation.evaluators import ZeroshotClassificationEvaluator
+from ..AbsTask import AbsTask, ScoresDict
+
+logger = logging.getLogger(__name__)
+
+
+class AbsTaskZeroshotClassification(AbsTask):
+ """Abstract class for ZeroshotClassification tasks
+ The similarity is computed between an image and candidate text prompts, such as "this is a dog"/"this is a cat".
+
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ image: list of Image.Image
+ label: list of int
+ """
+
+ image_column_name: str = "image"
+ label_column_name: str = "label"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def _add_main_score(self, scores) -> None:
+ scores["main_score"] = scores[self.metadata.main_score]
+
+ def _calculate_metrics_from_split(
+ self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+ ):
+ pass
+
+ def _evaluate_subset(
+ self,
+ model: Encoder,
+ dataset: Dataset,
+ *,
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ) -> ScoresDict:
+ candidate_labels = self.get_candidate_labels()
+ evaluator = ZeroshotClassificationEvaluator(
+ dataset,
+ self.image_column_name,
+ # dataset[self.image_column_name],#broken into dataset and self.image_column_name
+ dataset[self.label_column_name],
+ candidate_labels,
+ task_name=self.metadata.name,
+ **kwargs,
+ )
+ metrics = evaluator(model, encode_kwargs=encode_kwargs)
+
+ scores = {"accuracy": metrics["accuracy"]}
+ self._add_main_score(scores)
+ return scores
+
+ def get_candidate_labels(self) -> list[str]:
+ """Return the text candidates for zeroshot classification"""
+ raise NotImplementedError("This method should be overridden by subclasses")
diff --git a/mteb/abstasks/Image/__init__.py b/mteb/abstasks/Image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index 3f115b2dbb..24b3c9fa23 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -24,6 +24,7 @@
path_to_lang_codes,
path_to_lang_scripts,
)
+from ..modalities import MODALITIES
TASK_SUBTYPE = Literal[
"Article retrieval",
@@ -31,6 +32,7 @@
"Dialect pairing",
"Dialog Systems",
"Discourse coherence",
+ "Duplicate Image Retrieval",
"Language identification",
"Linguistic acceptability",
"Political classification",
@@ -47,6 +49,15 @@
"Counterfactual Detection",
"Emotion classification",
"Reasoning as Retrieval",
+ "Rendered Texts Understanding",
+ "Image Text Retrieval",
+ "Object recognition",
+ "Scene recognition",
+ "Caption Pairing",
+ "Emotion recognition",
+ "Textures recognition",
+ "Activity recognition",
+ "Tumor detection",
"Duplicate Detection",
]
@@ -64,6 +75,7 @@
"Poetry",
"Religious",
"Reviews",
+ "Scene",
"Social",
"Spoken",
"Subtitles",
@@ -72,6 +84,8 @@
"Programming",
"Chemistry",
"Financial",
+ "Chemistry",
+ "Financial",
]
SAMPLE_CREATION_METHOD = Literal[
@@ -83,6 +97,7 @@
"machine-translated and verified",
"machine-translated and localized",
"LM-generated and verified",
+ "rendered",
"multiple",
]
TASK_TYPE = Literal[
@@ -97,6 +112,15 @@
"Summarization",
"InstructionRetrieval",
"Speed",
+ "Any2AnyMultiChoice",
+ "Any2AnyRetrieval",
+ "Any2TextMutipleChoice",
+ "ImageClustering",
+ "ImageClassification",
+ "ImageMultilabelClassification",
+ "ImageTextPairClassification",
+ "VisualSTS",
+ "ZeroShotClassification",
]
@@ -104,6 +128,15 @@
"s2s", # Sentence-to-sentence
"s2p", # Sentence-to-paragraph
"p2p", # Paragraph-to-paragraph
+ "t2t", # specifically for text-only tasks in mieb
+ "i2i", # image-to-image
+ "i2t", # image-to-text
+ "t2i", # text-to-image
+ "it2t", # image+text-to-text
+ "it2i", # image+text-to-image
+ "i2it", # image-to-image+text
+ "t2it", # text-to-image+text
+ "it2it", # image+text-to-image+text
]
ANNOTATOR_TYPE = Literal[
@@ -175,7 +208,7 @@
"multiple",
]
)
-MODALITIES = Literal["text"]
+
METRIC_NAME = str
METRIC_VALUE = Union[int, float, dict[str, Any]]
@@ -198,6 +231,9 @@ class DescriptiveStatistics(TypedDict):
pass
+METRIC_VALUE = Union[int, float, dict[str, Any]]
+
+
logger = logging.getLogger(__name__)
@@ -223,15 +259,18 @@ class TaskMetadata(BaseModel):
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken", "Written". A dataset can belong to multiple domains.
task_subtypes: The subtypes of the task. E.g. includes "Sentiment/Hate speech", "Thematic Clustering". Feel free to update the list as needed.
license: The license of the data specified as lowercase, e.g. "cc-by-nc-4.0". If the license is not specified, use "not specified". For custom licenses a URL is used.
+ license: The license of the data specified as lowercase, e.g. "cc-by-nc-4.0". If the license is not specified, use "not specified". For custom licenses a URL is used.
annotations_creators: The type of the annotators. Includes "expert-annotated" (annotated by experts), "human-annotated" (annotated e.g. by
mturkers), "derived" (derived from structure in the data).
dialect: The dialect of the data, if applicable. Ideally specified as a BCP-47 language tag. Empty list if no dialects are present.
sample_creation: The method of text creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and
"machine-translated and localized".
prompt: The prompt used for the task. Can be a string or a dictionary containing the query and passage prompts.
+ prompt: The prompt used for the task. Can be a string or a dictionary containing the query and passage prompts.
bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available.
"""
+ dataset: dict[str, Any]
dataset: dict[str, Any]
name: str
diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py
index ef3e8853d7..c874bd2214 100644
--- a/mteb/abstasks/__init__.py
+++ b/mteb/abstasks/__init__.py
@@ -13,4 +13,13 @@
from .AbsTaskSpeedTask import *
from .AbsTaskSTS import *
from .AbsTaskSummarization import *
+from .Image.AbsTaskAny2AnyMultiChoice import *
+from .Image.AbsTaskAny2AnyRetrieval import *
+from .Image.AbsTaskAny2TextMultipleChoice import *
+from .Image.AbsTaskImageClassification import *
+from .Image.AbsTaskImageClustering import *
+from .Image.AbsTaskImageMultilabelClassification import *
+from .Image.AbsTaskImageTextPairClassification import *
+from .Image.AbsTaskVisualSTS import *
+from .Image.AbsTaskZeroshotClassification import *
from .MultilingualTask import *
diff --git a/mteb/create_meta.py b/mteb/create_meta.py
index bf7402613d..ea4bf9c952 100644
--- a/mteb/create_meta.py
+++ b/mteb/create_meta.py
@@ -82,7 +82,7 @@ def get_task_results(results_folder: Path) -> list[TaskResult]:
json_files = [
r
for r in results_folder.glob("*.json")
- if r.is_file() and r.name != "model_meta.json"
+ if r.is_file() and r.name != "model_meta.json" and "predictions" not in r.name
]
task_results = [TaskResult.from_disk(path) for path in json_files]
task_results = [
diff --git a/mteb/encoder_interface.py b/mteb/encoder_interface.py
index 1fac3a9405..5a66330cdc 100644
--- a/mteb/encoder_interface.py
+++ b/mteb/encoder_interface.py
@@ -6,6 +6,8 @@
import numpy as np
import torch
+from PIL import Image
+from torch.utils.data import DataLoader
Corpus = Union[list[dict[str, str]], dict[str, list[str]]]
@@ -29,6 +31,7 @@ def __init__(self, device: str | None = None) -> None:
Args:
device: The device to use for encoding. Can be ignored if the encoder is not using a device (e.g. for API)
"""
+ self.device = device
def encode(
self,
@@ -150,3 +153,55 @@ def convert_conv_history_to_query(conversations: Sequence[Sequence[str]]) -> str
The query.
"""
...
+
+
+class ImageEncoder:
+ """Interface for image encoder designed based on VLM2VecWrapper.
+ There is not a perfect 1-1 match, e.g. device can be None here.
+ The intention here is to define the current interface and adapt to as close to MTEB as possible
+ and align as much as possible with sentencetransformers.
+ """
+
+ def __init__(
+ self,
+ device: str | None,
+ **kwargs: Any,
+ ):
+ pass
+
+ def encode( # currently a 1-1 match with Encoder.encode
+ self,
+ sentences: Sequence[str],
+ *,
+ task_name: str,
+ prompt_type: PromptType | None = None,
+ **kwargs: Any,
+ ) -> np.ndarray:
+ pass
+
+ def get_image_embeddings( # Seems like sentence transformers use a singular encode for both images and text. Not sure if we want to do the same.
+ # If not it might be ideal to redefine Encoder.encode
+ self,
+ images: list[Image.Image] | DataLoader,
+ **kwargs,
+ # removed batch_size, it is not required that it will accept kwargs
+ ) -> np.ndarray: # added standard output (I believe we actually expect tensors in the code, but would like to be consistent)
+ pass
+
+ def get_text_embeddings( # any reason for this?
+ self,
+ texts: list[str],
+ **kwargs,
+ ) -> np.ndarray:
+ pass
+
+ def get_fused_embeddings( # hmm what if I have a document with images at specific positions?
+ self,
+ texts: list[str] | None = None,
+ images: list[Image.Image]
+ | DataLoader
+ | None = None, # the requirement for these two to be the same seems odd (docs without images, images without associated text, docs with multiple images)
+ # fusion_mode: str="sum", # will remove this as it should be required in the interface
+ **kwargs: Any,
+ ) -> np.ndarray:
+ pass
diff --git a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
new file mode 100644
index 0000000000..5fdbb112f3
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
@@ -0,0 +1,487 @@
+from __future__ import annotations
+
+import heapq
+import io
+import json
+import logging
+import math
+import os
+from collections import defaultdict
+from typing import Any
+
+import numpy as np
+import pytrec_eval
+import torch
+from datasets import Dataset
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import Encoder
+
+from ..Evaluator import Evaluator
+from ..utils import (
+ confidence_scores,
+ cos_sim,
+ dot_score,
+ download,
+ hole,
+ mrr,
+ nAUC,
+ recall_cap,
+ top_k_accuracy,
+)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+logger = logging.getLogger(__name__)
+
+transform = transforms.Compose([transforms.PILToTensor()])
+
+
+class ImageDataset(torch.utils.data.Dataset):
+ def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
+ self.dataset = hf_dataset
+ self.transform = transform
+ self.image_column_name = image_column_name
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, idx):
+ image = self.dataset[idx][self.image_column_name]
+ if isinstance(image, bytes):
+ image = Image.open(io.BytesIO(image))
+ else:
+ # Assume the image is already in a usable format (e.g., PIL Image)
+ image = image
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ image = self.transform(image)
+ return image
+
+
+def custom_collate_fn(batch):
+ return batch
+
+
+# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12
+class Any2AnyMultiChoiceSearch:
+ def __init__(
+ self,
+ model: Encoder,
+ encode_kwargs: dict[str, Any] = {},
+ corpus_chunk_size: int = 20000,
+ previous_results: str | None = None,
+ **kwargs: Any,
+ ):
+ # Model is class that provides get_text_embeddings() and get_image_embeddings()
+ self.model = model
+ self.encode_kwargs = encode_kwargs
+
+ if "batch_size" not in encode_kwargs:
+ encode_kwargs["batch_size"] = 128
+
+ self.score_functions = {"cos_sim": cos_sim, "dot": dot_score}
+ self.score_function_desc = {
+ "cos_sim": "Cosine Similarity",
+ "dot": "Dot Product",
+ }
+ self.corpus_chunk_size = corpus_chunk_size
+ self.previous_results = previous_results
+ self.batch_size = encode_kwargs.get("batch_size")
+ self.show_progress_bar = encode_kwargs.get("show_progress_bar")
+ self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
+ self.corpus_embeddings = defaultdict(list)
+ self.results = {}
+
+ if self.previous_results is not None:
+ self.previous_results = self.load_results_file()
+
+ def search(
+ self,
+ corpus: Dataset, # solve memory issues
+ queries: Dataset, # solve memory issues
+ qrels: Dataset,
+ top_k: int,
+ score_function: str,
+ return_sorted: bool = False,
+ **kwargs,
+ ) -> dict[str, dict[str, float]]:
+ if score_function not in self.score_functions:
+ raise ValueError(
+ f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product"
+ )
+
+ logger.info("Encoding Queries.")
+ query_ids = list(queries["id"])
+ self.results = {qid: {} for qid in query_ids}
+
+ q_modality = queries[0]["modality"]
+
+ if q_modality == "text":
+ query_texts = queries["text"]
+ query_embeddings = self.model.get_text_embeddings(
+ texts=query_texts, batch_size=self.encode_kwargs["batch_size"]
+ )
+ else:
+ queries_dataset = ImageDataset(
+ queries, image_column_name="image", transform=transform
+ )
+ query_image_dataloader = DataLoader(
+ queries_dataset,
+ batch_size=self.encode_kwargs["batch_size"],
+ shuffle=False,
+ collate_fn=custom_collate_fn,
+ num_workers=min(math.floor(os.cpu_count() / 2), 16),
+ )
+ if q_modality == "image":
+ query_embeddings = self.model.get_image_embeddings(
+ images=query_image_dataloader,
+ batch_size=self.encode_kwargs["batch_size"],
+ )
+ elif q_modality == "image,text":
+ query_texts = queries["text"]
+ query_embeddings = self.model.get_fused_embeddings(
+ texts=query_texts,
+ images=query_image_dataloader,
+ batch_size=self.encode_kwargs["batch_size"],
+ )
+ else:
+ raise ValueError(f"Unsupported modality: {q_modality}")
+
+ logger.info("Preparing Corpus...")
+ corpus_ids = list(corpus["id"])
+
+ corpus_modality = corpus[0]["modality"]
+
+ logger.info("Encoding Corpus in batches... Warning: This might take a while!")
+ logger.info(
+ f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})"
+ )
+
+ result_heaps = {qid: [] for qid in query_ids}
+ for chunk_start in range(0, len(corpus), self.corpus_chunk_size):
+ chunk = corpus.select(
+ range(
+ chunk_start, min(chunk_start + self.corpus_chunk_size, len(corpus))
+ )
+ )
+ chunk_ids = corpus_ids[chunk_start : chunk_start + self.corpus_chunk_size]
+
+ if corpus_modality == "text":
+ corpus_texts = chunk["text"]
+ sub_corpus_embeddings = self.model.get_text_embeddings(
+ texts=corpus_texts, batch_size=self.encode_kwargs["batch_size"]
+ )
+ else:
+ corpus_dataset = ImageDataset(
+ chunk, image_column_name="image", transform=transform
+ )
+ corpus_image_dataloader = DataLoader(
+ corpus_dataset,
+ batch_size=self.encode_kwargs["batch_size"],
+ shuffle=False,
+ collate_fn=custom_collate_fn,
+ num_workers=min(math.floor(os.cpu_count() / 2), 16),
+ )
+ if corpus_modality == "image":
+ sub_corpus_embeddings = self.model.get_image_embeddings(
+ images=corpus_image_dataloader,
+ batch_size=self.encode_kwargs["batch_size"],
+ )
+ elif corpus_modality == "image,text":
+ corpus_texts = chunk["text"]
+ sub_corpus_embeddings = self.model.get_fused_embeddings(
+ texts=corpus_texts,
+ images=corpus_image_dataloader,
+ batch_size=self.encode_kwargs["batch_size"],
+ )
+ else:
+ raise ValueError(f"Unsupported modality: {corpus_modality}")
+
+ cos_scores = self.score_functions[score_function](
+ query_embeddings, sub_corpus_embeddings
+ )
+ cos_scores[torch.isnan(cos_scores)] = -1
+
+ for query_idx in range(len(query_embeddings)):
+ query_id = query_ids[query_idx]
+                # exclude candidates that are not among this query's multiple choices (i.e., have no qrel entry for this query/corpus pair)
+ for c_idx, c_id in enumerate(chunk_ids):
+ if c_id not in qrels[query_id]:
+ cos_scores[query_idx, c_idx] = -1
+
+ cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
+ cos_scores,
+ min(top_k, cos_scores.size(1)),
+ dim=1,
+ largest=True,
+ sorted=return_sorted,
+ )
+ cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
+ cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+
+ for query_itr in range(len(query_embeddings)):
+ query_id = query_ids[query_itr]
+ for sub_corpus_id, score in zip(
+ cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]
+ ):
+ corpus_id = chunk_ids[sub_corpus_id]
+ if len(result_heaps[query_id]) < top_k:
+ heapq.heappush(result_heaps[query_id], (score, corpus_id))
+ else:
+ heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+
+ for qid in result_heaps:
+ for score, corpus_id in result_heaps[qid]:
+ self.results[qid][corpus_id] = score
+
+ return self.results
+
+ def load_results_file(self):
+ # load the first stage results from file in format {qid: {doc_id: score}}
+ if "https://" in self.previous_results:
+ # download the file
+ if not os.path.exists(self.previous_results):
+ url_descriptor = self.previous_results.split("https://")[-1].replace(
+ "/", "--"
+ )
+ dest_file = os.path.join(
+ "results", f"cached_predictions--{url_descriptor}"
+ )
+ os.makedirs(os.path.dirname(os.path.abspath(dest_file)), exist_ok=True)
+ download(self.previous_results, dest_file)
+ logger.info(
+ f"Downloaded the previous results at {self.previous_results} to {dest_file}"
+ )
+ self.previous_results = dest_file
+
+ with open(self.previous_results) as f:
+ previous_results = json.load(f)
+ assert isinstance(previous_results, dict)
+ assert isinstance(previous_results[list(previous_results.keys())[0]], dict)
+ return previous_results
+
+
+class Any2AnyMultiChoiceEvaluator(Evaluator):
+ def __init__(
+ self,
+ retriever=None,
+ task_name: str | None = None,
+ k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000],
+ score_function: str = "cos_sim",
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.retriever = Any2AnyMultiChoiceSearch(
+ retriever, encode_kwargs=encode_kwargs, **kwargs
+ )
+ self.k_values = k_values
+ self.top_k = (
+ max(k_values) if "top_k" not in kwargs else kwargs["top_k"]
+ ) # can lower it if reranking
+ self.score_function = score_function
+ self.task_name = task_name
+
+ def __call__(
+ self,
+ corpus: dict[str, dict[str, str | Image.Image]],
+ queries: dict[str, dict[str, str | Image.Image]],
+ qrels: dict[str, dict[str, int]],
+ ) -> dict[str, dict[str, float]]:
+ if not self.retriever:
+ raise ValueError("Model/Technique has not been provided!")
+
+ return self.retriever.search(
+ corpus,
+ queries,
+ qrels,
+ self.top_k,
+ self.score_function,
+ prompt_name=self.task_name, # type: ignore
+ )
+
+ @staticmethod
+ def evaluate(
+ qrels: dict[str, dict[str, int]],
+ results: dict[str, dict[str, float]],
+ k_values: list[int],
+ ignore_identical_ids: bool = False,
+ skip_first_result: bool = False,
+ ) -> tuple[
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ ]:
+ if ignore_identical_ids:
+ logger.debug(
+ "For evaluation, ``ignore_identical_ids=True`` is set to True, the evaluator will ignore identical query and document ids."
+ )
+ # Remove identical ids from results dict
+ for qid, rels in results.items():
+ for pid in list(rels):
+ if qid == pid:
+ results[qid].pop(pid)
+ else:
+ logger.debug(
+ "For evaluation, we DO NOT ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=True`` to ignore this."
+ )
+
+ all_ndcgs, all_aps, all_recalls, all_precisions, all_cv_recalls = (
+ {},
+ {},
+ {},
+ {},
+ {},
+ )
+
+ for k in k_values:
+ all_ndcgs[f"NDCG@{k}"] = []
+ all_aps[f"MAP@{k}"] = []
+ all_recalls[f"Recall@{k}"] = []
+ all_precisions[f"P@{k}"] = []
+ all_cv_recalls[f"CV_Recall@{k}"] = [] # (new) CV-style Recall
+
+ map_string = "map_cut." + ",".join([str(k) for k in k_values])
+ ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values])
+ recall_string = "recall." + ",".join([str(k) for k in k_values])
+ precision_string = "P." + ",".join([str(k) for k in k_values])
+ evaluator = pytrec_eval.RelevanceEvaluator(
+ qrels, {map_string, ndcg_string, recall_string, precision_string}
+ )
+ scores = evaluator.evaluate(results)
+
+ sorted_results = {
+ qid: sorted(rels.items(), key=lambda item: item[1], reverse=True)
+ for qid, rels in results.items()
+ }
+
+ if skip_first_result:
+ for qid, rels in sorted_results.items():
+ sorted_results[qid].pop(0)
+
+ for query_id in scores.keys():
+ top_docs = [
+ doc_id for doc_id, _ in sorted_results.get(query_id, [])
+ ] # Sorted list of doc IDs
+            # exclude qrels with a ground-truth score of 0 (not considered relevant)
+ relevant_docs = {
+ key
+ for key in qrels.get(query_id, {}).keys()
+ if qrels[query_id][key] != 0
+ }
+
+ for k in k_values:
+ top_k_docs = top_docs[:k]
+ all_ndcgs[f"NDCG@{k}"].append(scores[query_id]["ndcg_cut_" + str(k)])
+ all_aps[f"MAP@{k}"].append(scores[query_id]["map_cut_" + str(k)])
+ all_recalls[f"Recall@{k}"].append(scores[query_id]["recall_" + str(k)])
+ all_precisions[f"P@{k}"].append(scores[query_id]["P_" + str(k)])
+
+ if relevant_docs.intersection(top_k_docs):
+ all_cv_recalls[f"CV_Recall@{k}"].append(1.0)
+ else:
+ all_cv_recalls[f"CV_Recall@{k}"].append(0.0)
+
+ ndcg, _map, recall, precision, cv_recall = (
+ all_ndcgs.copy(),
+ all_aps.copy(),
+ all_recalls.copy(),
+ all_precisions.copy(),
+ all_cv_recalls.copy(),
+ )
+
+ for k in k_values:
+ ndcg[f"NDCG@{k}"] = round(sum(ndcg[f"NDCG@{k}"]) / len(scores), 5)
+ _map[f"MAP@{k}"] = round(sum(_map[f"MAP@{k}"]) / len(scores), 5)
+ recall[f"Recall@{k}"] = round(sum(recall[f"Recall@{k}"]) / len(scores), 5)
+ precision[f"P@{k}"] = round(sum(precision[f"P@{k}"]) / len(scores), 5)
+ cv_recall[f"CV_Recall@{k}"] = round(
+ sum(cv_recall[f"CV_Recall@{k}"]) / len(scores), 5
+ )
+
+ naucs = Any2AnyMultiChoiceEvaluator.evaluate_abstention(
+ results,
+ {**all_ndcgs, **all_aps, **all_recalls, **all_precisions, **all_cv_recalls},
+ )
+
+ return ndcg, _map, recall, precision, cv_recall, naucs
+
+ @staticmethod
+ def evaluate_custom(
+ qrels: dict[str, dict[str, int]],
+ results: dict[str, dict[str, float]],
+ k_values: list[int],
+ metric: str,
+ output_type: str = "all",
+ ) -> tuple[dict[str, float]]:
+ if metric.lower() in ["mrr", "mrr@k", "mrr_cut"]:
+ metric_scores = mrr(qrels, results, k_values, output_type)
+
+ elif metric.lower() in ["recall_cap", "r_cap", "r_cap@k"]:
+ metric_scores = recall_cap(qrels, results, k_values, output_type)
+
+ elif metric.lower() in ["hole", "hole@k"]:
+ metric_scores = hole(qrels, results, k_values, output_type)
+
+ elif metric.lower() in [
+ "acc",
+ "top_k_acc",
+ "accuracy",
+ "accuracy@k",
+ "top_k_accuracy",
+ ]:
+ metric_scores = top_k_accuracy(qrels, results, k_values, output_type)
+
+ naucs = Any2AnyMultiChoiceEvaluator.evaluate_abstention(results, metric_scores)
+ metric_scores_avg = {k: sum(v) / len(v) for k, v in metric_scores.items()}
+
+ return metric_scores_avg, naucs
+
+ @staticmethod
+ def evaluate_abstention(
+ results: dict[str, dict[str, float]],
+ metric_scores: dict[str, list[float]],
+ ) -> dict[str, float]:
+ """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997"""
+ all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
+ all_conf_scores = [
+ confidence_scores(sim_scores) for sim_scores in all_sim_scores
+ ]
+ conf_fcts = list(all_conf_scores[0].keys())
+ all_conf_scores = {
+ fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
+ }
+ metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+ naucs = {}
+
+ for metric_name, scores in metric_scores.items():
+ for fct, conf_scores in all_conf_scores.items():
+ naucs[f"nAUC_{metric_name}_{fct}"] = nAUC(conf_scores, scores)
+
+ return naucs
+
+ @staticmethod
+ def calculate_cv_style_recall(
+ qrels: dict[str, dict[str, int]], results: dict[str, dict[str, float]], k: int
+ ) -> dict[str, float]:
+ """Calculate CV-style recall: Recall is 1 if any relevant document is
+ retrieved in the top k, otherwise 0.
+ """
+ cv_recalls = {}
+ for query_id, relevant_docs in qrels.items():
+ retrieved_docs = list(results.get(query_id, {}).keys())[
+ :k
+ ] # Retrieve top k documents
+ if any(doc_id in relevant_docs for doc_id in retrieved_docs):
+ cv_recalls[query_id] = (
+ 1.0 # If any relevant doc is found in top k, recall is 1
+ )
+ else:
+ cv_recalls[query_id] = 0.0 # Otherwise, recall is 0
+ return cv_recalls
diff --git a/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py
new file mode 100644
index 0000000000..777e3b545f
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py
@@ -0,0 +1,491 @@
+from __future__ import annotations
+
+import heapq
+import io
+import json
+import logging
+import math
+import os
+from collections import defaultdict
+from typing import Any
+
+import numpy as np
+import pytrec_eval
+import torch
+from datasets import Dataset
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import Encoder, PromptType
+
+from ..Evaluator import Evaluator
+from ..utils import (
+ confidence_scores,
+ cos_sim,
+ dot_score,
+ download,
+ hole,
+ mrr,
+ nAUC,
+ recall_cap,
+ top_k_accuracy,
+)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_TRANSFORM = transforms.Compose([transforms.PILToTensor()])
+
+
+class ImageDataset(torch.utils.data.Dataset):
+ def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
+ self.dataset = hf_dataset
+ self.transform = transform
+ self.image_column_name = image_column_name
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, idx):
+ image = self.dataset[idx][self.image_column_name]
+ if isinstance(image, bytes):
+ image = Image.open(io.BytesIO(image))
+ else:
+ # Assume the image is already in a usable format (e.g., PIL Image)
+ image = image
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ if self.transform is not None:
+ image = self.transform(image)
+ return image
+
+
+def custom_collate_fn(batch):
+ return batch
+
+
+# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12
+class Any2AnyDenseRetrievalExactSearch:
+ def __init__(
+ self,
+ model: Encoder,
+ encode_kwargs: dict[str, Any] = {},
+ corpus_chunk_size: int = 20000,
+ previous_results: str | None = None,
+ transform=DEFAULT_TRANSFORM,
+ **kwargs: Any,
+ ):
+        # Model is a class that provides get_text_embeddings() and get_image_embeddings()
+ self.model = model
+ self.encode_kwargs = encode_kwargs
+ self.transform = transform
+
+ if "batch_size" not in encode_kwargs:
+ encode_kwargs["batch_size"] = 128
+
+ self.score_functions = {"cos_sim": cos_sim, "dot": dot_score}
+ self.score_function_desc = {
+ "cos_sim": "Cosine Similarity",
+ "dot": "Dot Product",
+ }
+ self.corpus_chunk_size = corpus_chunk_size
+ self.previous_results = previous_results
+ self.batch_size = encode_kwargs.get("batch_size")
+ self.show_progress_bar = encode_kwargs.get("show_progress_bar")
+ self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
+ self.corpus_embeddings = defaultdict(list)
+ self.results = {}
+
+ if self.previous_results is not None:
+ self.previous_results = self.load_results_file()
+
+ def search(
+ self,
+        corpus: Dataset,  # solve memory issues
+        queries: Dataset,  # solve memory issues
+ top_k: int,
+ score_function: str,
+ task_name: str,
+ return_sorted: bool = False,
+ **kwargs,
+ ) -> dict[str, dict[str, float]]:
+ if score_function not in self.score_functions:
+ raise ValueError(
+ f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product"
+ )
+
+ logger.info("Encoding Queries.")
+ query_ids = list(queries["id"])
+ self.results = {qid: {} for qid in query_ids}
+
+ q_modality = queries[0]["modality"]
+
+ if q_modality == "text":
+ query_texts = queries["text"]
+ query_embeddings = self.model.get_text_embeddings(
+ texts=query_texts,
+ task_name=task_name,
+ prompt_type=PromptType.query,
+ **self.encode_kwargs,
+ )
+ else:
+ queries_dataset = ImageDataset(
+ queries, image_column_name="image", transform=self.transform
+ )
+ query_image_dataloader = DataLoader(
+ queries_dataset,
+ batch_size=self.encode_kwargs["batch_size"],
+ shuffle=False,
+ collate_fn=custom_collate_fn,
+ num_workers=min(math.floor(os.cpu_count() / 2), 16),
+ )
+ if q_modality == "image":
+ query_embeddings = self.model.get_image_embeddings(
+ images=query_image_dataloader,
+ task_name=task_name,
+ prompt_type=PromptType.query,
+ **self.encode_kwargs,
+ )
+ elif q_modality == "image,text":
+ query_texts = queries["text"]
+ query_embeddings = self.model.get_fused_embeddings(
+ texts=query_texts,
+ images=query_image_dataloader,
+ task_name=task_name,
+ prompt_type=PromptType.query,
+ **self.encode_kwargs,
+ )
+ else:
+ raise ValueError(f"Unsupported modality: {q_modality}")
+
+ logger.info("Preparing Corpus...")
+ corpus_ids = list(corpus["id"])
+
+ corpus_modality = corpus[0]["modality"]
+
+ logger.info("Encoding Corpus in batches... Warning: This might take a while!")
+ logger.info(
+ f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})"
+ )
+
+ result_heaps = {qid: [] for qid in query_ids}
+ for chunk_start in range(0, len(corpus), self.corpus_chunk_size):
+ chunk = corpus.select(
+ range(
+ chunk_start, min(chunk_start + self.corpus_chunk_size, len(corpus))
+ )
+ )
+ chunk_ids = corpus_ids[chunk_start : chunk_start + self.corpus_chunk_size]
+
+ if corpus_modality == "text":
+ corpus_texts = chunk["text"]
+ sub_corpus_embeddings = self.model.get_text_embeddings(
+ texts=corpus_texts,
+ task_name=task_name,
+ prompt_type=PromptType.passage,
+ **self.encode_kwargs,
+ )
+ else:
+ corpus_dataset = ImageDataset(
+ chunk, image_column_name="image", transform=self.transform
+ )
+ corpus_image_dataloader = DataLoader(
+ corpus_dataset,
+ batch_size=self.encode_kwargs["batch_size"],
+ shuffle=False,
+ collate_fn=custom_collate_fn,
+ num_workers=min(math.floor(os.cpu_count() / 2), 16),
+ )
+ if corpus_modality == "image":
+ sub_corpus_embeddings = self.model.get_image_embeddings(
+ images=corpus_image_dataloader,
+ task_name=task_name,
+ prompt_type=PromptType.passage,
+ **self.encode_kwargs,
+ )
+ elif corpus_modality == "image,text":
+ corpus_texts = chunk["text"]
+ sub_corpus_embeddings = self.model.get_fused_embeddings(
+ texts=corpus_texts,
+ images=corpus_image_dataloader,
+ task_name=task_name,
+ prompt_type=PromptType.passage,
+ **self.encode_kwargs,
+ )
+ else:
+ raise ValueError(f"Unsupported modality: {corpus_modality}")
+
+ cos_scores = self.score_functions[score_function](
+ query_embeddings, sub_corpus_embeddings
+ )
+ cos_scores[torch.isnan(cos_scores)] = -1
+
+ cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
+ cos_scores,
+ min(top_k, cos_scores.size(1)),
+ dim=1,
+ largest=True,
+ sorted=return_sorted,
+ )
+ cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
+ cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+
+ for query_itr in range(len(query_embeddings)):
+ query_id = query_ids[query_itr]
+ for sub_corpus_id, score in zip(
+ cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]
+ ):
+ corpus_id = chunk_ids[sub_corpus_id]
+ if len(result_heaps[query_id]) < top_k:
+ heapq.heappush(result_heaps[query_id], (score, corpus_id))
+ else:
+ heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+
+ for qid in result_heaps:
+ for score, corpus_id in result_heaps[qid]:
+ self.results[qid][corpus_id] = score
+
+ return self.results
+
+ def load_results_file(self):
+ # load the first stage results from file in format {qid: {doc_id: score}}
+ if "https://" in self.previous_results:
+ # download the file
+ if not os.path.exists(self.previous_results):
+ url_descriptor = self.previous_results.split("https://")[-1].replace(
+ "/", "--"
+ )
+ dest_file = os.path.join(
+ "results", f"cached_predictions--{url_descriptor}"
+ )
+ os.makedirs(os.path.dirname(os.path.abspath(dest_file)), exist_ok=True)
+ download(self.previous_results, dest_file)
+ logger.info(
+ f"Downloaded the previous results at {self.previous_results} to {dest_file}"
+ )
+ self.previous_results = dest_file
+
+ with open(self.previous_results) as f:
+ previous_results = json.load(f)
+ assert isinstance(previous_results, dict)
+ assert isinstance(previous_results[list(previous_results.keys())[0]], dict)
+ return previous_results
+
+
+# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/evaluation.py#L9
+class Any2AnyRetrievalEvaluator(Evaluator):
+ def __init__(
+ self,
+ retriever=None,
+ task_name: str | None = None,
+ k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000],
+ score_function: str = "cos_sim",
+ encode_kwargs: dict[str, Any] = {},
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.retriever = Any2AnyDenseRetrievalExactSearch(
+ retriever, encode_kwargs=encode_kwargs, **kwargs
+ )
+ self.k_values = k_values
+ self.top_k = (
+ max(k_values) if "top_k" not in kwargs else kwargs["top_k"]
+ ) # can lower it if reranking
+ self.score_function = score_function
+ self.task_name = task_name
+
+ def __call__(
+ self,
+ corpus: dict[str, dict[str, str | Image.Image]],
+ queries: dict[str, dict[str, str | Image.Image]],
+ ) -> dict[str, dict[str, float]]:
+ if not self.retriever:
+ raise ValueError("Model/Technique has not been provided!")
+
+ return self.retriever.search(
+ corpus,
+ queries,
+ self.top_k,
+ self.score_function,
+ task_name=self.task_name,
+ )
+
+ @staticmethod
+ def evaluate(
+ qrels: dict[str, dict[str, int]],
+ results: dict[str, dict[str, float]],
+ k_values: list[int],
+ ignore_identical_ids: bool = False,
+ skip_first_result: bool = False,
+ ) -> tuple[
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ dict[str, float],
+ ]:
+ if ignore_identical_ids:
+ logger.debug(
+ "For evaluation, ``ignore_identical_ids=True`` is set to True, the evaluator will ignore identical query and document ids."
+ )
+ # Remove identical ids from results dict
+ for qid, rels in results.items():
+ for pid in list(rels):
+ if qid == pid:
+ results[qid].pop(pid)
+ else:
+ logger.debug(
+ "For evaluation, we DO NOT ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=True`` to ignore this."
+ )
+
+ all_ndcgs, all_aps, all_recalls, all_precisions, all_cv_recalls = (
+ {},
+ {},
+ {},
+ {},
+ {},
+ )
+
+ for k in k_values:
+ all_ndcgs[f"NDCG@{k}"] = []
+ all_aps[f"MAP@{k}"] = []
+ all_recalls[f"Recall@{k}"] = []
+ all_precisions[f"P@{k}"] = []
+ all_cv_recalls[f"CV_Recall@{k}"] = [] # (new) CV-style Recall
+
+ map_string = "map_cut." + ",".join([str(k) for k in k_values])
+ ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values])
+ recall_string = "recall." + ",".join([str(k) for k in k_values])
+ precision_string = "P." + ",".join([str(k) for k in k_values])
+ evaluator = pytrec_eval.RelevanceEvaluator(
+ qrels, {map_string, ndcg_string, recall_string, precision_string}
+ )
+ scores = evaluator.evaluate(results)
+
+ sorted_results = {
+ qid: sorted(rels.items(), key=lambda item: item[1], reverse=True)
+ for qid, rels in results.items()
+ }
+
+ if skip_first_result:
+ for qid, rels in sorted_results.items():
+ sorted_results[qid].pop(0)
+
+ for query_id in scores.keys():
+ top_docs = [
+ doc_id for doc_id, _ in sorted_results.get(query_id, [])
+ ] # Sorted list of doc IDs
+ relevant_docs = set(qrels.get(query_id, {}).keys())
+
+ for k in k_values:
+ top_k_docs = top_docs[:k]
+ all_ndcgs[f"NDCG@{k}"].append(scores[query_id]["ndcg_cut_" + str(k)])
+ all_aps[f"MAP@{k}"].append(scores[query_id]["map_cut_" + str(k)])
+ all_recalls[f"Recall@{k}"].append(scores[query_id]["recall_" + str(k)])
+ all_precisions[f"P@{k}"].append(scores[query_id]["P_" + str(k)])
+
+ if relevant_docs.intersection(top_k_docs):
+ all_cv_recalls[f"CV_Recall@{k}"].append(1.0)
+ else:
+ all_cv_recalls[f"CV_Recall@{k}"].append(0.0)
+
+ ndcg, _map, recall, precision, cv_recall = (
+ all_ndcgs.copy(),
+ all_aps.copy(),
+ all_recalls.copy(),
+ all_precisions.copy(),
+ all_cv_recalls.copy(),
+ )
+
+ for k in k_values:
+ ndcg[f"NDCG@{k}"] = round(sum(ndcg[f"NDCG@{k}"]) / len(scores), 5)
+ _map[f"MAP@{k}"] = round(sum(_map[f"MAP@{k}"]) / len(scores), 5)
+ recall[f"Recall@{k}"] = round(sum(recall[f"Recall@{k}"]) / len(scores), 5)
+ precision[f"P@{k}"] = round(sum(precision[f"P@{k}"]) / len(scores), 5)
+ cv_recall[f"CV_Recall@{k}"] = round(
+ sum(cv_recall[f"CV_Recall@{k}"]) / len(scores), 5
+ )
+
+ naucs = Any2AnyRetrievalEvaluator.evaluate_abstention(
+ results,
+ {**all_ndcgs, **all_aps, **all_recalls, **all_precisions, **all_cv_recalls},
+ )
+
+ return ndcg, _map, recall, precision, cv_recall, naucs
+
+ @staticmethod
+ def evaluate_custom(
+ qrels: dict[str, dict[str, int]],
+ results: dict[str, dict[str, float]],
+ k_values: list[int],
+ metric: str,
+ output_type: str = "all",
+ ) -> tuple[dict[str, float]]:
+ if metric.lower() in ["mrr", "mrr@k", "mrr_cut"]:
+ metric_scores = mrr(qrels, results, k_values, output_type)
+
+ elif metric.lower() in ["recall_cap", "r_cap", "r_cap@k"]:
+ metric_scores = recall_cap(qrels, results, k_values, output_type)
+
+ elif metric.lower() in ["hole", "hole@k"]:
+ metric_scores = hole(qrels, results, k_values, output_type)
+
+ elif metric.lower() in [
+ "acc",
+ "top_k_acc",
+ "accuracy",
+ "accuracy@k",
+ "top_k_accuracy",
+ ]:
+ metric_scores = top_k_accuracy(qrels, results, k_values, output_type)
+
+ naucs = Any2AnyRetrievalEvaluator.evaluate_abstention(results, metric_scores)
+ metric_scores_avg = {k: sum(v) / len(v) for k, v in metric_scores.items()}
+
+ return metric_scores_avg, naucs
+
+ @staticmethod
+ def evaluate_abstention(
+ results: dict[str, dict[str, float]],
+ metric_scores: dict[str, list[float]],
+ ) -> dict[str, float]:
+ """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997"""
+ all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
+ all_conf_scores = [
+ confidence_scores(sim_scores) for sim_scores in all_sim_scores
+ ]
+ conf_fcts = list(all_conf_scores[0].keys())
+ all_conf_scores = {
+ fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
+ }
+ metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+ naucs = {}
+
+ for metric_name, scores in metric_scores.items():
+ for fct, conf_scores in all_conf_scores.items():
+ naucs[f"nAUC_{metric_name}_{fct}"] = nAUC(conf_scores, scores)
+
+ return naucs
+
+ @staticmethod
+ def calculate_cv_style_recall(
+ qrels: dict[str, dict[str, int]], results: dict[str, dict[str, float]], k: int
+ ) -> dict[str, float]:
+ """Calculate CV-style recall: Recall is 1 if any relevant document is
+ retrieved in the top k, otherwise 0.
+ """
+ cv_recalls = {}
+ for query_id, relevant_docs in qrels.items():
+ retrieved_docs = list(results.get(query_id, {}).keys())[
+ :k
+ ] # Retrieve top k documents
+ if any(doc_id in relevant_docs for doc_id in retrieved_docs):
+ cv_recalls[query_id] = (
+ 1.0 # If any relevant doc is found in top k, recall is 1
+ )
+ else:
+ cv_recalls[query_id] = 0.0 # Otherwise, recall is 0
+ return cv_recalls
diff --git a/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py
new file mode 100644
index 0000000000..a93714e770
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+import torch
+from sklearn.metrics import accuracy_score
+from sklearn.metrics.pairwise import cosine_similarity
+from torchvision import transforms
+from tqdm import tqdm
+
+from mteb.encoder_interface import Encoder, EncoderWithSimilarity
+from mteb.evaluation.evaluators.Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
+transform = transforms.Compose([transforms.PILToTensor()])
+
+
+class Any2TextMultipleChoiceEvaluator(Evaluator):
+ """Evaluate a model based on the similarity of queries (can be interleaved) and candidate answers.
+ The goal is to find the correct text in multiple candidates that
+ forms the correct answer of the interleaved query.
+
+ Args:
+        query_modalities: the modality of queries; supports image, text, or both at the moment.
+        query_column_names: column names of queries; should align with query modalities.
+        label_column_name: column name of labels.
+        choices_column_name: column name of candidate choices.
+ """
+
+ def __init__(
+ self,
+ dataset,
+ query_modalities: str | list[str],
+ query_column_names: dict,
+ label_column_name: str,
+ choices_column_name: str,
+ task_name: str | None = None,
+ transform=None,
+ limit: int | None = None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if limit:
+ dataset = dataset.select(range(limit))
+ self.dataset = dataset
+ self.query_modalities = query_modalities
+ self.query_column_names = query_column_names
+ self.label_column_name = label_column_name
+ self.choices_column_name = choices_column_name
+ self.task_name = task_name
+ self.transform = transform
+
+ def __call__(
+ self,
+ model: Encoder | EncoderWithSimilarity,
+ encode_kwargs: dict[str, Any] = {},
+ ):
+ if "batch_size" not in encode_kwargs:
+ encode_kwargs["batch_size"] = 64
+
+ label_list = list(
+ {x for n in self.dataset[self.choices_column_name] for x in n}
+ )
+ label_embeddings = model.get_text_embeddings(label_list)
+ label_embedding_dict = {}
+ for label, embedding in zip(label_list, label_embeddings):
+ label_embedding_dict[label] = embedding
+
+ if "text" in self.query_modalities:
+ questions = self.dataset[self.query_column_names["text"]]
+ else:
+ questions = None
+ if "image" in self.query_modalities:
+ images = self.dataset[self.query_column_names["image"]]
+ query_embeddings = model.get_fused_embeddings(
+ texts=questions,
+ images=images,
+ batch_size=encode_kwargs["batch_size"],
+ )
+
+ answers = self.dataset[self.label_column_name]
+ choices = self.dataset[self.choices_column_name]
+
+        # note that answers are the indices (into choices) of the correct options
+ predictions = []
+ for q_embedding, choice in tqdm(zip(query_embeddings, choices)):
+ choice_embeddings = torch.vstack(
+ [label_embedding_dict[c] for c in choice]
+ ) # (choice_size, embedding_dim)
+ q_embedding = q_embedding[np.newaxis, :]
+ cos_sim = cosine_similarity(q_embedding, choice_embeddings)
+ predictions.append(np.argmax(cos_sim))
+
+ metrics = {}
+ metrics["accuracy"] = accuracy_score(predictions, answers)
+ return metrics
diff --git a/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py
new file mode 100644
index 0000000000..a0d84d5714
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py
@@ -0,0 +1,388 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from typing import Any
+
+import numpy as np
+import torch
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (
+ accuracy_score,
+ average_precision_score,
+ f1_score,
+)
+from sklearn.neighbors import KNeighborsClassifier
+from torch import Tensor
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import Encoder
+
+from ..Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
+
def dot_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Distance as the negated dot product (larger dot product => closer)."""
    similarity = np.dot(a, b)
    return -similarity
+
+
# Shared torchvision preprocessing: convert a PIL image to a uint8 tensor (C, H, W).
transform = transforms.Compose([transforms.PILToTensor()])
+
+
class ImageDataset(torch.utils.data.Dataset):
    """Wrap an indexable image dataset so a DataLoader can consume it.

    Args:
        hf_dataset: Dataset supporting ``ds[idx][column]`` access.
        image_column_name: Column holding PIL images.
        transform: Optional callable applied to each (RGB-converted) image.
    """

    def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
        self.dataset = hf_dataset
        self.transform = transform
        self.image_column_name = image_column_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image = self.dataset[idx][self.image_column_name]
        # Models expect 3-channel input; convert greyscale/RGBA/palette images.
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Fix: honour the documented default transform=None instead of
        # crashing with "'NoneType' object is not callable".
        if self.transform is not None:
            image = self.transform(image)
        return image
+
+
def custom_collate_fn(batch):
    # Identity collate: return the raw list of samples so variable-sized
    # images/tensors are not stacked by DataLoader's default collate.
    return batch
+
+
class ImagekNNClassificationEvaluator(Evaluator):
    """k-NN image-classification probe on frozen embeddings (sklearn backend).

    Embeds the train/test images once, then fits sklearn k-NN classifiers
    with cosine and euclidean metrics and reports the best accuracy/F1
    (plus average precision for binary tasks).

    Args:
        dataset_train: Dataset split used as the neighbour pool.
        dataset_test: Dataset split used for evaluation.
        image_column_name: Column holding PIL images.
        label_column_name: Column holding integer labels.
        task_name: Optional task identifier.
        k: Number of neighbours.
        encode_kwargs: Encoding options; ``batch_size`` defaults to 32.
        limit: Optional cap on the number of training examples.
    """

    def __init__(
        self,
        dataset_train,
        dataset_test,
        image_column_name,
        label_column_name,
        task_name: str | None = None,
        k: int = 1,
        encode_kwargs: dict[str, Any] | None = None,
        limit: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if limit is not None:
            dataset_train = dataset_train.select(list(range(limit)))

        self.dataset_train = ImageDataset(
            dataset_train, image_column_name=image_column_name, transform=transform
        )
        self.y_train = dataset_train[label_column_name]

        self.dataset_test = ImageDataset(
            dataset_test, image_column_name=image_column_name, transform=transform
        )
        self.y_test = dataset_test[label_column_name]
        self.task_name = task_name
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across all instances and was mutated below).
        self.encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        self.encode_kwargs.setdefault("batch_size", 32)

        self.k = k

    def __call__(self, model, test_cache=None):
        """Run the probe.

        Args:
            model: Encoder exposing ``get_image_embeddings``.
            test_cache: Optional precomputed test embeddings, reused to avoid
                re-encoding across repeated calls.

        Returns:
            Tuple of (scores dict, test embeddings for caching).
        """
        scores = {}
        max_accuracy = 0
        max_f1 = 0
        max_ap = 0
        num_workers = min(math.floor(os.cpu_count() / 2), 16)
        dataloader_train = DataLoader(
            self.dataset_train,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        X_train = model.get_image_embeddings(
            dataloader_train, batch_size=self.encode_kwargs["batch_size"]
        )
        dataloader = DataLoader(
            self.dataset_test,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            # Fix: the test loader previously used the default collate, which
            # stacks tensors and fails on variable-sized images; use the same
            # identity collate as the train loader.
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        if test_cache is None:
            test_cache = model.get_image_embeddings(
                dataloader, batch_size=self.encode_kwargs["batch_size"]
            )
        X_test = test_cache
        is_binary = len(np.unique(self.y_train)) == 2
        for metric in ["cosine", "euclidean"]:  # TODO: "dot"
            knn = KNeighborsClassifier(n_neighbors=self.k, n_jobs=-1, metric=metric)
            knn.fit(X_train, self.y_train)
            y_pred = knn.predict(X_test)
            accuracy = accuracy_score(self.y_test, y_pred)
            f1 = f1_score(self.y_test, y_pred, average="macro")
            scores["accuracy_" + metric] = accuracy
            scores["f1_" + metric] = f1
            max_accuracy = max(max_accuracy, accuracy)
            max_f1 = max(max_f1, f1)  # type: ignore
            # if binary classification
            if is_binary:
                ap = average_precision_score(self.y_test, y_pred)
                scores["ap_" + metric] = ap
                max_ap = max(max_ap, ap)
        # Headline scores: best over the distance metrics tried.
        scores["accuracy"] = max_accuracy
        scores["f1"] = max_f1
        if is_binary:
            scores["ap"] = max_ap
        return scores, test_cache
+
+
class ImagekNNClassificationEvaluatorPytorch(Evaluator):
    """k-NN image-classification probe computed with dense torch distance matrices.

    Unlike :class:`ImagekNNClassificationEvaluator`, neighbour search is done
    with torch ops, which also supports the dot-product metric that sklearn's
    ``KNeighborsClassifier`` lacks.

    Args:
        dataset_train: Dataset split used as the neighbour pool.
        dataset_test: Dataset split used for evaluation.
        image_column_name: Column holding PIL images.
        label_column_name: Column holding integer labels.
        task_name: Task identifier.
        k: Number of neighbours.
        encode_kwargs: Encoding options; ``batch_size`` defaults to 32.
        limit: Optional cap on the number of training examples.
    """

    def __init__(
        self,
        dataset_train,
        dataset_test,
        image_column_name,
        label_column_name,
        task_name: str,
        k: int = 1,
        encode_kwargs: dict[str, Any] | None = None,
        limit: int | None = None,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        if limit is not None:
            dataset_train = dataset_train.select(list(range(limit)))

        self.dataset_train = ImageDataset(
            dataset_train, image_column_name=image_column_name, transform=transform
        )
        self.y_train = dataset_train[label_column_name]

        self.dataset_test = ImageDataset(
            dataset_test, image_column_name=image_column_name, transform=transform
        )
        self.y_test = dataset_test[label_column_name]
        self.task_name = task_name
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across all instances and was mutated below).
        self.encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        self.encode_kwargs.setdefault("batch_size", 32)

        self.k = k

    def __call__(self, model: Encoder, test_cache=None):
        """Run the probe; returns (scores dict, test embeddings for caching)."""
        scores = {}
        max_accuracy = 0
        max_f1 = 0
        max_ap = 0

        num_workers = min(math.floor(os.cpu_count() / 2), 16)
        dataloader_train = DataLoader(
            self.dataset_train,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        X_train = model.get_image_embeddings(
            dataloader_train, batch_size=self.encode_kwargs["batch_size"]
        )

        dataloader = DataLoader(
            self.dataset_test,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            # Fix: use the identity collate here too; the default collate
            # stacks tensors and fails on variable-sized images.
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        if test_cache is None:
            test_cache = model.get_image_embeddings(
                dataloader, batch_size=self.encode_kwargs["batch_size"]
            )
        X_test = test_cache
        y_train = torch.tensor(self.y_train)  # hoisted: invariant across metrics
        is_binary = len(np.unique(self.y_train)) == 2
        for metric in ["cosine", "euclidean", "dot"]:
            if metric == "cosine":
                distances = 1 - self._cos_sim(X_test, X_train)
            elif metric == "euclidean":
                distances = self._euclidean_dist(X_test, X_train)
            elif metric == "dot":
                distances = -self._dot_score(X_test, X_train)
            neigh_indices = torch.topk(
                distances, k=self.k, dim=1, largest=False
            ).indices
            y_pred = torch.mode(
                y_train[neigh_indices], dim=1
            ).values  # TODO: case where there is no majority
            y_pred = y_pred.tolist()
            accuracy = accuracy_score(self.y_test, y_pred)
            f1 = f1_score(self.y_test, y_pred, average="macro")
            scores["accuracy_" + metric] = accuracy
            scores["f1_" + metric] = f1
            max_accuracy = max(max_accuracy, accuracy)
            max_f1 = max(max_f1, f1)  # type: ignore
            # if binary classification
            if is_binary:
                ap = average_precision_score(self.y_test, y_pred)
                scores["ap_" + metric] = ap
                max_ap = max(max_ap, ap)
        # Headline scores: best over the distance metrics tried.
        scores["accuracy"] = max_accuracy
        scores["f1"] = max_f1
        if is_binary:
            scores["ap"] = max_ap
        return scores, test_cache

    @staticmethod
    def _as_2d_tensor(x) -> Tensor:
        # Coerce numpy arrays / lists to a tensor and promote 1-D to (1, dim).
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        if len(x.shape) == 1:
            x = x.unsqueeze(0)
        return x

    @staticmethod
    def _cos_sim(a: Tensor, b: Tensor):
        """Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.

        Return:
            Matrix with res[i][j] = cos_sim(a[i], b[j])
        """
        a = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(a)
        b = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(b)
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

    @staticmethod
    def _euclidean_dist(a: Tensor, b: Tensor):
        """Computes the euclidean distance euclidean_dist(a[i], b[j]) for all i and j.

        Returns:
            Matrix with res[i][j] = euclidean_dist(a[i], b[j])
        """
        a = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(a)
        b = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(b)
        return torch.cdist(a, b, p=2)

    @staticmethod
    def _dot_score(a: Tensor, b: Tensor):
        """Computes the dot-product dot_prod(a[i], b[j]) for all i and j.

        Returns:
            Matrix with res[i][j] = dot_prod(a[i], b[j])
        """
        a = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(a)
        b = ImagekNNClassificationEvaluatorPytorch._as_2d_tensor(b)
        return torch.mm(a, b.transpose(0, 1))
+
+
class ImagelogRegClassificationEvaluator(Evaluator):
    """Linear-probe image classification: logistic regression on frozen embeddings.

    Args:
        dataset_train: Dataset split used to fit the classifier.
        dataset_test: Dataset split used for evaluation.
        image_column_name: Column holding PIL images.
        label_column_name: Column holding integer labels.
        task_name: Task identifier.
        max_iter: Maximum solver iterations for LogisticRegression.
        encode_kwargs: Encoding options; ``batch_size`` defaults to 32.
        limit: Optional cap on the number of training examples.
    """

    def __init__(
        self,
        dataset_train,
        dataset_test,
        image_column_name,
        label_column_name,
        task_name: str,
        max_iter: int = 100,
        encode_kwargs: dict[str, Any] | None = None,
        limit: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across all instances and was mutated below).
        self.encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        self.encode_kwargs.setdefault("batch_size", 32)

        if limit is not None:
            dataset_train = dataset_train.select(list(range(limit)))

        self.dataset_train = ImageDataset(
            dataset_train, image_column_name=image_column_name, transform=transform
        )
        self.y_train = dataset_train[label_column_name]
        self.dataset_test = ImageDataset(
            dataset_test, image_column_name=image_column_name, transform=transform
        )
        self.y_test = dataset_test[label_column_name]

        self.max_iter = max_iter
        self.task_name = task_name

    def __call__(self, model, test_cache=None):
        """Fit the probe and score the test split.

        Returns:
            Tuple of (scores dict, test embeddings for caching).
        """
        scores = {}
        clf = LogisticRegression(
            random_state=self.seed,
            n_jobs=-1,
            max_iter=self.max_iter,
            verbose=1 if logger.isEnabledFor(logging.DEBUG) else 0,
        )
        num_workers = min(math.floor(os.cpu_count() / 2), 16)
        dataloader_train = DataLoader(
            self.dataset_train,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        X_train = model.get_image_embeddings(
            dataloader_train, batch_size=self.encode_kwargs["batch_size"]
        )
        dataloader = DataLoader(
            self.dataset_test,
            batch_size=self.encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        if test_cache is None:
            test_cache = model.get_image_embeddings(
                dataloader, batch_size=self.encode_kwargs["batch_size"]
            )
        X_test = test_cache
        logger.info("Fitting logistic regression classifier...")
        # sklearn cannot consume bfloat16 tensors; upcast before fitting.
        if X_train.dtype == torch.bfloat16:
            X_train = X_train.to(torch.float32)
        if X_test.dtype == torch.bfloat16:
            X_test = X_test.to(torch.float32)
        clf.fit(X_train, self.y_train)
        logger.info("Evaluating...")
        y_pred = clf.predict(X_test)
        scores["accuracy"] = accuracy_score(self.y_test, y_pred)
        scores["f1"] = f1_score(self.y_test, y_pred, average="macro")
        scores["f1_weighted"] = f1_score(self.y_test, y_pred, average="weighted")

        # if binary classification
        if len(np.unique(self.y_train)) == 2:
            # NOTE(review): AP is computed from hard predictions rather than
            # predict_proba scores — confirm this is intended.
            scores["ap"] = average_precision_score(self.y_test, y_pred, average="macro")
            scores["ap_weighted"] = average_precision_score(
                self.y_test, y_pred, average="weighted"
            )

        return scores, test_cache
diff --git a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
new file mode 100644
index 0000000000..fbf5e0bb14
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import sklearn
+import sklearn.cluster
+from PIL import Image
+from scipy.optimize import linear_sum_assignment
+from sklearn import metrics
+
+from mteb.encoder_interface import Encoder
+from mteb.evaluation.evaluators.Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
+
class ImageClusteringEvaluator(Evaluator):
    """Cluster image embeddings with mini-batch k-means and score against labels.

    Reports V-measure, NMI, ARI, and a Hungarian-matched cluster accuracy.

    Args:
        images: PIL images to cluster.
        labels: Ground-truth integer labels; their cardinality sets the
            number of clusters.
        task_name: Optional task identifier.
        clustering_batch_size: Mini-batch size for k-means.
        limit: Optional cap on the number of examples.
    """

    def __init__(
        self,
        images: list[Image.Image],
        labels: list[int],
        task_name: str | None = None,
        clustering_batch_size: int = 500,
        limit: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if limit is not None:
            images = images[:limit]
            labels = labels[:limit]
        self.images = images
        self.labels = labels
        self.clustering_batch_size = clustering_batch_size
        self.task_name = task_name

    def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] | None = None):
        """Embed, cluster and score; returns a dict of clustering metrics."""
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across calls and was mutated here).
        encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        encode_kwargs.setdefault("batch_size", 32)

        image_embeddings = model.get_image_embeddings(
            self.images,
            batch_size=encode_kwargs["batch_size"],
        )

        logger.info("Fitting Mini-Batch K-Means model...")
        clustering_model = sklearn.cluster.MiniBatchKMeans(
            n_clusters=len(set(self.labels)),
            batch_size=self.clustering_batch_size,
            n_init="auto",
        )
        clustering_model.fit(image_embeddings)
        cluster_assignment = clustering_model.labels_

        logger.info("Evaluating...")
        v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)
        nmi = metrics.cluster.normalized_mutual_info_score(
            self.labels, cluster_assignment
        )
        ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment)

        # Cluster accuracy via the best one-to-one cluster/label mapping
        # (Hungarian algorithm on the confusion matrix).
        matrix = metrics.confusion_matrix(self.labels, cluster_assignment)
        row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
        total_correct = matrix[row_ind, col_ind].sum()
        clustering_accuracy = total_correct / len(self.labels)

        return {
            "v_measure": v_measure,
            "nmi": nmi,
            "ari": ari,
            "cluster_accuracy": clustering_accuracy,
        }
diff --git a/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py
new file mode 100644
index 0000000000..7e3d84bb87
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import Encoder, EncoderWithSimilarity
+from mteb.evaluation.evaluators.Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
# Shared torchvision preprocessing: convert a PIL image to a uint8 tensor (C, H, W).
transform = transforms.Compose([transforms.PILToTensor()])
+
+
class ImageTextDataset(torch.utils.data.Dataset):
    """Index rows of paired images and captions for a DataLoader.

    Args:
        hf_dataset: Dataset supporting ``ds[idx][column]`` access.
        images_column_names: Single column name, or list of columns, with images.
        texts_column_names: Single column name, or list of columns, with captions.
        transform: Optional callable applied to each image.
    """

    def __init__(
        self, hf_dataset, images_column_names, texts_column_names, transform=None
    ):
        self.dataset = hf_dataset
        self.transform = transform
        self.images_column_names = images_column_names
        self.texts_column_names = texts_column_names

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]

        # Get images (a single item for a str column, else a list).
        if isinstance(self.images_column_names, str):
            images = data[self.images_column_names]
        else:
            images = [data[col] for col in self.images_column_names]

        # Apply transforms to images.
        # Fix: the previous list comprehension assumed `images` was always a
        # list, breaking for a single str column name with a transform set.
        if self.transform is not None:
            if isinstance(images, list):
                images = [self.transform(img) for img in images]
            else:
                images = self.transform(images)

        # Get texts
        if isinstance(self.texts_column_names, str):
            texts = data[self.texts_column_names]
        else:
            texts = [data[col] for col in self.texts_column_names]

        return images, texts
+
+
def custom_collate_fn(batch):
    # Identity collate: return the raw list of samples so variable-sized
    # images/tensors are not stacked by DataLoader's default collate.
    return batch
+
+
class ImageTextPairClassificationEvaluator(Evaluator):
    """Evaluate image-caption matching within each dataset row.

    Each row holds one or more images and one or more captions whose
    ground-truth pairing is positional (image i <-> caption i). All pairwise
    image/caption similarities within a row are computed; a row counts as
    correct when every image's most-similar caption and every caption's
    most-similar image agree with that positional pairing.

    Args:
        dataset: Dataset of image/caption rows.
        images_column_names: Column (or list of columns) holding images.
        texts_column_names: Column (or list of columns) holding captions.
        task_name: Optional task identifier.
        transform: Optional image transform applied by the dataset.
        limit: Optional cap on the number of rows.
    """

    def __init__(
        self,
        dataset,
        images_column_names: str | list[str],
        texts_column_names: str | list[str],
        task_name: str | None = None,
        transform=None,
        limit: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if limit:
            dataset = dataset.select(range(limit))
        self.dataset = dataset
        self.images_column_names = images_column_names
        self.texts_column_names = texts_column_names
        self.task_name = task_name
        self.transform = transform

    def __call__(
        self,
        model: Encoder | EncoderWithSimilarity,
        encode_kwargs: dict[str, Any] | None = None,
    ):
        """Returns ``image_acc``, ``text_acc`` and joint ``accuracy`` over rows."""
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across calls and was mutated here).
        encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        encode_kwargs.setdefault("batch_size", 64)

        data_loader = DataLoader(
            ImageTextDataset(
                self.dataset,
                self.images_column_names,
                self.texts_column_names,
                transform=self.transform,
            ),
            batch_size=encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,  # identity collate
            num_workers=min(math.floor(os.cpu_count() / 2), 16),
        )

        num_images_per_sample = (
            len(self.images_column_names)
            if isinstance(self.images_column_names, list)
            else 1
        )
        num_texts_per_sample = (
            len(self.texts_column_names)
            if isinstance(self.texts_column_names, list)
            else 1
        )

        # Ground truth is the positional pairing within each row.
        img_ground_truths = torch.arange(num_images_per_sample)
        caption_ground_truths = torch.arange(num_texts_per_sample)

        image_score = []
        text_score = []
        score = []

        for batch in data_loader:
            images_list, texts_list = zip(*batch)
            # Flatten rows so the model encodes one big batch, then reshape
            # back to (rows, items_per_row, dim) below.
            images = [img for images in images_list for img in images]
            texts = [txt for texts in texts_list for txt in texts]
            images_emb = F.normalize(
                model.get_image_embeddings(images, batch_size=len(images)),
                dim=-1,
            ).view(len(batch), num_images_per_sample, -1)
            texts_emb = F.normalize(
                model.get_text_embeddings(texts, batch_size=len(texts)),
                dim=-1,
            ).view(len(batch), num_texts_per_sample, -1)
            for i in range(len(batch)):
                img_emb = images_emb[i]
                txt_emb = texts_emb[i]

                scores = (
                    img_emb @ txt_emb.t()
                )  # shape = (num_images_per_sample x num_texts_per_sample)

                image_closest_text = scores.argmax(
                    dim=1
                )  # shape = (num_images_per_sample)
                text_closest_image = scores.argmax(
                    dim=0
                )  # shape = (num_texts_per_sample)
                pred_text_is_correct = (
                    (image_closest_text == img_ground_truths).all().item()
                )
                pred_image_is_correct = (
                    (text_closest_image == caption_ground_truths).all().item()
                )
                all_correct = pred_text_is_correct and pred_image_is_correct
                image_score.append(pred_image_is_correct)
                text_score.append(pred_text_is_correct)
                score.append(all_correct)

        metrics = {}
        metrics["image_acc"] = torch.Tensor(image_score).float().mean().item()
        metrics["text_acc"] = torch.Tensor(text_score).float().mean().item()
        metrics["accuracy"] = torch.Tensor(score).float().mean().item()
        return metrics
diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
new file mode 100644
index 0000000000..a042d22f5a
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from typing import Any
+
+import numpy as np
+import torch
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics.pairwise import (
+ paired_cosine_distances,
+ paired_euclidean_distances,
+ paired_manhattan_distances,
+)
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from ..Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
# Shared torchvision preprocessing: convert a PIL image to a uint8 tensor (C, H, W).
transform = transforms.Compose([transforms.PILToTensor()])
+
+
class ImageDataset(torch.utils.data.Dataset):
    """Wrap an indexable image dataset so a DataLoader can consume it.

    Args:
        hf_dataset: Dataset supporting ``ds[idx][column]`` access.
        image_column_name: Column holding PIL images.
        transform: Optional callable applied to each (RGB-converted) image.
    """

    def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
        self.dataset = hf_dataset
        self.transform = transform
        self.image_column_name = image_column_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image = self.dataset[idx][self.image_column_name]
        # Models expect 3-channel input; convert greyscale/RGBA/palette images.
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Fix: honour the documented default transform=None instead of
        # crashing with "'NoneType' object is not callable".
        if self.transform is not None:
            image = self.transform(image)
        return image
+
+
def custom_collate_fn(batch):
    # Identity collate: return the raw list of samples so variable-sized
    # images/tensors are not stacked by DataLoader's default collate.
    return batch
+
+
class VisualSTSEvaluator(Evaluator):
    """Visual STS: correlate image-pair embedding similarity with gold scores.

    Each row holds two images (rendered sentences) and a gold similarity
    score. Pearson/Spearman correlations are reported for cosine, Manhattan
    and Euclidean similarities, plus the model's own similarity function
    when it exposes one.

    Args:
        dataset: Dataset with two image columns.
        sentences_column_names: The two image column names [left, right].
        gold_scores: Human similarity judgements, one per row.
        task_name: Optional task identifier.
    """

    def __init__(
        self,
        dataset,
        sentences_column_names: list[str],
        gold_scores: list[float],
        task_name: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.sentence1_dataset = ImageDataset(
            dataset, image_column_name=sentences_column_names[0], transform=transform
        )
        self.sentence2_dataset = ImageDataset(
            dataset, image_column_name=sentences_column_names[1], transform=transform
        )
        self.gold_scores = gold_scores
        self.task_name = task_name
        # TODO use task_name for prompts with interleaved encoding.

    def __call__(
        self,
        model,  # TODO: model type
        *,
        encode_kwargs: dict[str, Any] | None = None,
    ):
        """Embed both image columns and return correlation metrics."""
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across calls and was mutated here).
        encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        encode_kwargs.setdefault("batch_size", 32)

        num_workers = min(math.floor(os.cpu_count() / 2), 16)
        sentence1_dataloader = DataLoader(
            self.sentence1_dataset,
            batch_size=encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )
        sentence2_dataloader = DataLoader(
            self.sentence2_dataset,
            batch_size=encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=num_workers,
        )

        embeddings1 = model.get_image_embeddings(
            sentence1_dataloader, batch_size=encode_kwargs["batch_size"]
        )
        embeddings2 = model.get_image_embeddings(
            sentence2_dataloader, batch_size=encode_kwargs["batch_size"]
        )

        logger.info("Evaluating...")
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        # Distances negated so that larger always means "more similar".
        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)

        cosine_pearson, _ = pearsonr(self.gold_scores, cosine_scores)
        cosine_spearman, _ = spearmanr(self.gold_scores, cosine_scores)

        manhattan_pearson, _ = pearsonr(self.gold_scores, manhattan_distances)
        manhattan_spearman, _ = spearmanr(self.gold_scores, manhattan_distances)

        euclidean_pearson, _ = pearsonr(self.gold_scores, euclidean_distances)
        euclidean_spearman, _ = spearmanr(self.gold_scores, euclidean_distances)

        similarity_scores = None
        if hasattr(model, "similarity_pairwise"):
            similarity_scores = model.similarity_pairwise(embeddings1, embeddings2)  # type: ignore
        elif hasattr(model, "similarity"):
            _similarity_scores = [
                float(model.similarity(e1, e2))  # type: ignore
                for e1, e2 in zip(embeddings1, embeddings2)
            ]
            similarity_scores = np.array(_similarity_scores)

        if similarity_scores is not None:
            # Fix: unpack the correlation statistic — pearsonr/spearmanr
            # return (statistic, pvalue) results, whereas the fallback branch
            # below assigns plain floats.
            pearson, _ = pearsonr(self.gold_scores, similarity_scores)
            spearman, _ = spearmanr(self.gold_scores, similarity_scores)
        else:
            # if model does not have a similarity function, we assume the cosine similarity
            pearson = cosine_pearson
            spearman = cosine_spearman

        return {
            # using the models own similarity score
            "pearson": pearson,
            "spearman": spearman,
            # generic similarity scores
            "cosine_pearson": cosine_pearson,
            "cosine_spearman": cosine_spearman,
            "manhattan_pearson": manhattan_pearson,
            "manhattan_spearman": manhattan_spearman,
            "euclidean_pearson": euclidean_pearson,
            "euclidean_spearman": euclidean_spearman,
        }
diff --git a/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py
new file mode 100644
index 0000000000..2ef9609ea4
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from typing import Any
+
+import torch
+from sklearn import metrics
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import Encoder
+
+from ..Evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
# Shared torchvision preprocessing: convert a PIL image to a uint8 tensor (C, H, W).
transform = transforms.Compose([transforms.PILToTensor()])
+
+
class ImageDataset(torch.utils.data.Dataset):
    """Wrap an indexable image dataset so a DataLoader can consume it.

    Args:
        hf_dataset: Dataset supporting ``ds[idx][column]`` access.
        image_column_name: Column holding PIL images.
        transform: Optional callable applied to each (RGB-converted) image.
    """

    def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
        self.dataset = hf_dataset
        self.transform = transform
        self.image_column_name = image_column_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image = self.dataset[idx][self.image_column_name]
        # Models expect 3-channel input; convert greyscale/RGBA/palette images.
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Fix: honour the documented default transform=None instead of
        # crashing with "'NoneType' object is not callable".
        if self.transform is not None:
            image = self.transform(image)
        return image
+
+
def custom_collate_fn(batch):
    # Identity collate: return the raw list of samples so variable-sized
    # images/tensors are not stacked by DataLoader's default collate.
    return batch
+
+
class ZeroshotClassificationEvaluator(Evaluator):
    """Zero-shot image classification against text candidate labels.

    Embeds the candidate label prompts and all images, asks the model for
    per-image probabilities over the candidates, and scores accuracy of the
    argmax prediction.

    Args:
        dataset: Dataset holding the images.
        image_column_name: Column holding PIL images.
        labels: Ground-truth label indices, aligned with ``candidate_labels``.
        candidate_labels: Text prompts, one per class.
        task_name: Optional task identifier.
    """

    def __init__(
        self,
        dataset,
        image_column_name: str,
        labels: list[int],
        candidate_labels: list[str],
        task_name: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.dataset = ImageDataset(
            dataset, image_column_name=image_column_name, transform=transform
        )
        self.image_column_name = image_column_name
        self.labels = labels
        self.candidate_labels = candidate_labels
        self.task_name = task_name

    def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] | None = None):
        """Returns {"accuracy": fraction of images assigned their true label}."""
        # Copy instead of using a shared mutable default ({} as a parameter
        # default is shared across calls and was mutated here).
        encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        encode_kwargs.setdefault("batch_size", 32)

        dataloader = DataLoader(
            self.dataset,
            batch_size=encode_kwargs["batch_size"],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=min(math.floor(os.cpu_count() / 2), 16),
        )

        text_embeddings = model.get_text_embeddings(
            self.candidate_labels, batch_size=encode_kwargs["batch_size"]
        )

        image_embeddings = model.get_image_embeddings(
            dataloader, batch_size=encode_kwargs["batch_size"]
        )

        # probs: (n_images, n_candidate_labels); argmax picks the best label.
        probs = model.calculate_probs(text_embeddings, image_embeddings)
        predictions = probs.argmax(dim=1)

        logger.info("Evaluating...")

        accuracy = metrics.accuracy_score(self.labels, predictions.tolist())

        return {"accuracy": accuracy}
diff --git a/mteb/evaluation/evaluators/Image/__init__.py b/mteb/evaluation/evaluators/Image/__init__.py
new file mode 100644
index 0000000000..a5654a6e8c
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/__init__.py
@@ -0,0 +1,2 @@
+# from .ClassificationEvaluator import *
+# from .ZeroshotClassificationEvaluator import *
diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
index 77b8ecc0a4..cdf497e5a6 100644
--- a/mteb/evaluation/evaluators/RetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -371,8 +371,9 @@ class DRESModel:
mteb_model_meta: ModelMeta | None
def __init__(self, model, **kwargs):
- self.model = model
+ self.model: Any = model
self.use_sbert_model = isinstance(model, SentenceTransformer)
+ self.device = model.device if hasattr(model, "device") else None
self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
self.corpus_embeddings = {}
diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py
index a1dc8faaa5..d6ad94a88d 100644
--- a/mteb/evaluation/evaluators/__init__.py
+++ b/mteb/evaluation/evaluators/__init__.py
@@ -3,6 +3,14 @@
from .BitextMiningEvaluator import *
from .ClassificationEvaluator import *
from .ClusteringEvaluator import *
+from .Image.Any2AnyMultiChoiceEvaluator import *
+from .Image.Any2AnyRetrievalEvaluator import *
+from .Image.Any2TextMultipleChoiceEvaluator import *
+from .Image.ClassificationEvaluator import *
+from .Image.ClusteringEvaluator import *
+from .Image.ImageTextPairClassificationEvaluator import *
+from .Image.VisualSTSEvaluator import *
+from .Image.ZeroshotClassificationEvaluator import *
from .PairClassificationEvaluator import *
from .RerankingEvaluator import *
from .RetrievalEvaluator import *
diff --git a/mteb/modalities.py b/mteb/modalities.py
new file mode 100644
index 0000000000..ff83f963af
--- /dev/null
+++ b/mteb/modalities.py
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from typing_extensions import Literal
+
# Closed vocabulary of supported model modalities; ModelMeta.modalities
# draws its entries from this Literal.
MODALITIES = Literal[
    "text",
    "image",
]
diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 48c8b1295f..6486c849ea 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -11,6 +11,7 @@
from mteb.encoder_interface import Encoder
from .languages import ISO_LANGUAGE_SCRIPT
+from .modalities import MODALITIES
if TYPE_CHECKING:
from .models.sentence_transformer_wrapper import SentenceTransformerWrapper
@@ -80,6 +81,7 @@ class ModelMeta(BaseModel):
a benchmark as well as mark dataset contaminations.
adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
+        modalities: A list of strings representing the modalities the model supports. Default is ["text"].
"""
model_config = ConfigDict(extra="forbid")
@@ -103,6 +105,7 @@ class ModelMeta(BaseModel):
training_datasets: dict[str, list[str]] | None
adapted_from: str | None = None
superseded_by: str | None = None
+ modalities: list[MODALITIES] = ["text"]
def to_dict(self):
dict_repr = self.model_dump()
diff --git a/mteb/models/align_models.py b/mteb/models/align_models.py
new file mode 100644
index 0000000000..95fb6fda25
--- /dev/null
+++ b/mteb/models/align_models.py
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
class ALIGNModelWrapper:
    """mteb wrapper around Hugging Face ALIGN checkpoints.

    Exposes the text/image/fused embedding interface used by the MIEB
    evaluators on top of ``AutoModel``/``AutoProcessor``.
    """

    def __init__(
        self,
        model_name: str,
        # NOTE: default is evaluated once at import time, so the CUDA check
        # reflects the environment when this module was first loaded.
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        **kwargs: Any,
    ):
        self.model_name = model_name
        self.device = device
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_name)

    def get_text_embeddings(
        self,
        texts: list[str],
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Embed texts in batches; returns a (len(texts), dim) CPU tensor."""
        all_text_embeddings = []

        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size)):
                batch_texts = texts[i : i + batch_size]
                inputs = self.processor(
                    text=batch_texts, return_tensors="pt", padding=True, truncation=True
                )
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                text_outputs = self.model.get_text_features(**inputs)
                # Move to CPU per batch to keep GPU memory bounded.
                all_text_embeddings.append(text_outputs.cpu())

        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
        return all_text_embeddings

    def get_image_embeddings(
        self,
        images: list[Image.Image] | DataLoader,
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Embed images (a list or a pre-batched DataLoader) to a CPU tensor.

        When given a DataLoader, batches are used as-is (RGB conversion is
        assumed to have happened in the dataset); for a plain list, non-RGB
        PIL images are converted here.
        """
        all_image_embeddings = []
        if isinstance(images, DataLoader):
            with torch.no_grad():
                for batch in tqdm(images):
                    inputs = self.processor(
                        images=batch, return_tensors="pt", padding=True
                    )
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    image_outputs = self.model.get_image_features(**inputs)
                    all_image_embeddings.append(image_outputs.cpu())
        else:
            with torch.no_grad():
                for i in tqdm(range(0, len(images), batch_size)):
                    batch_images = images[i : i + batch_size]
                    # The processor expects 3-channel input.
                    batch_images = [
                        img.convert("RGB")
                        if isinstance(img, Image.Image) and img.mode != "RGB"
                        else img
                        for img in batch_images
                    ]
                    inputs = self.processor(
                        images=batch_images, return_tensors="pt", padding=True
                    )
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    image_outputs = self.model.get_image_features(**inputs)
                    all_image_embeddings.append(image_outputs.cpu())

        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
        return all_image_embeddings

    def calculate_probs(self, text_embeddings, image_embeddings):
        """Per-image softmax over candidate texts from cosine similarities.

        Returns a (n_images, n_texts) probability matrix.
        """
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
        image_embeddings = image_embeddings / image_embeddings.norm(
            dim=-1, keepdim=True
        )
        logits = torch.matmul(image_embeddings, text_embeddings.T)
        # Fixed logit scale of 100 — presumably matches the model's trained
        # temperature; verify against the checkpoint's logit_scale.
        probs = (logits * 100).softmax(dim=-1)
        return probs

    def get_fused_embeddings(
        self,
        texts: list[str] = None,
        images: list[Image.Image] | DataLoader = None,
        fusion_mode="sum",
        **kwargs: Any,
    ):
        """Embed texts and/or images; when both are given, fuse element-wise.

        Raises:
            ValueError: If neither input is given, if lengths differ, or if
                an unimplemented fusion mode is requested.
        """
        if texts is None and images is None:
            raise ValueError("Either texts or images must be provided")

        text_embeddings = None
        image_embeddings = None

        if texts is not None:
            text_embeddings = self.get_text_embeddings(texts, **kwargs)

        if images is not None:
            image_embeddings = self.get_image_embeddings(images, **kwargs)

        if text_embeddings is not None and image_embeddings is not None:
            if len(text_embeddings) != len(image_embeddings):
                raise ValueError(
                    "The number of texts and images must have the same length"
                )
            if fusion_mode == "sum":
                fused_embeddings = text_embeddings + image_embeddings
            else:
                # TODO: add other fusion modes
                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
            return fused_embeddings
        elif text_embeddings is not None:
            return text_embeddings
        elif image_embeddings is not None:
            return image_embeddings
+
+
# Registry entry for kakaobrain/align-base; the loader builds the wrapper lazily.
align_base = ModelMeta(
    loader=partial(
        ALIGNModelWrapper,
        model_name="kakaobrain/align-base",
    ),
    name="kakaobrain/align-base",
    languages=["eng_Latn"],
    revision="e96a37facc7b1f59090ece82293226b817afd6ba",
    release_date="2023-02-24",
    modalities=["image", "text"],
    n_parameters=176_000_000,
    max_tokens=64,
    embed_dim=768,
    license=None,
    open_weights=True,
    public_training_code="https://github.com/kakaobrain/coyo-align",
    public_training_data=True,
    framework=["PyTorch"],
    reference="https://huggingface.co/kakaobrain/align-base",
    similarity_fn_name=None,
    use_instructions=False,
    training_datasets={
        # COYO-700M
    },
)
diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
new file mode 100644
index 0000000000..e396cf39f0
--- /dev/null
+++ b/mteb/models/blip2_models.py
@@ -0,0 +1,270 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import Blip2Processor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+def blip2_loader(**kwargs):
+    """Lazily import LAVIS and return a constructed ``BLIP2ModelWrapper``.
+
+    Kwargs are forwarded to the wrapper (expects at least ``model_name``).
+    """
+    try:  # a temporary fix for the dependency issues.
+        from lavis.models.blip2_models.blip2_image_text_matching import (
+            Blip2ITM,
+        )
+    except ImportError:
+        raise ImportError(
+            "Please install `pip install mteb[blip2]` to use BLIP-2 models."
+        )
+
+    class BLIP2ModelWrapper:
+        """Wraps a LAVIS ``Blip2ITM`` model for text/image/multimodal encoding."""
+
+        def __init__(
+            self,
+            model_name: str,
+            device: str = "cuda" if torch.cuda.is_available() else "cpu",
+            **kwargs: Any,
+        ):
+            self.model_name = model_name
+            self.device = device
+            # LAVIS identifies checkpoints by type ("coco"/"pretrain"), not HF repo id.
+            model_type = "coco" if "coco" in model_name else "pretrain"
+            self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float()
+            # print number of parameters
+            print(
+                f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}"
+            )
+            # NOTE(review): weights come from LAVIS but the processor from the HF hub;
+            # assumes both preprocess identically — confirm.
+            self.processor = Blip2Processor.from_pretrained(model_name)
+
+        def preprocess(
+            self,
+            texts: list[str],
+            images: list[Image.Image],
+        ):
+            """Jointly tokenize texts and preprocess images with the HF processor."""
+            return self.processor(
+                text=texts, images=images, return_tensors="pt", padding=True
+            )
+
+        def get_text_embeddings(
+            self,
+            texts: list[str],
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            """Encode texts in batches via the BLIP-2 text tower; returns a CPU tensor."""
+            all_text_embeddings = []
+
+            with torch.no_grad():
+                for i in tqdm(range(0, len(texts), batch_size)):
+                    batch_texts = texts[i : i + batch_size]
+                    text_tokens = self.model.tokenizer(
+                        batch_texts,
+                        padding="max_length",
+                        truncation=True,
+                        max_length=self.model.max_txt_len,
+                        return_tensors="pt",
+                    ).to(self.device)
+                    text_outputs = self.model.forward_text(text_tokens)
+                    # text_outputs = normalize(self.model.text_proj(text_outputs))
+                    all_text_embeddings.append(text_outputs.cpu())
+
+            all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+            return all_text_embeddings
+
+        def get_image_embeddings(
+            self,
+            images: list[Image.Image] | DataLoader,
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            """Encode images from a DataLoader or a list; returns a CPU tensor.
+
+            Keeps only the first (CLS) token of the vision tower's output.
+            """
+            all_image_embeddings = []
+
+            if isinstance(images, DataLoader):
+                with torch.no_grad():
+                    for batch in tqdm(images):
+                        inputs = self.processor(
+                            images=batch, return_tensors="pt", padding=True
+                        )
+                        image_outputs = self.model.forward_image(
+                            inputs["pixel_values"].to(self.device)
+                        )
+                        # first element is the hidden states; take the CLS token
+                        image_outputs = image_outputs[0][:, 0, :]
+                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        all_image_embeddings.append(image_outputs.cpu())
+            else:
+                with torch.no_grad():
+                    for i in tqdm(range(0, len(images), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        inputs = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )["pixel_values"].to(self.device)
+                        image_outputs = self.model.forward_image(inputs)
+                        image_outputs = image_outputs[0][:, 0, :]
+                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        all_image_embeddings.append(image_outputs.cpu())
+
+            all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+            return all_image_embeddings
+
+        def get_multimodal_embeddings(self, texts, images, batch_size=32):
+            """Jointly encode paired texts/images via ``extract_features``.
+
+            Texts and images are assumed to be index-aligned pairs.
+            """
+            all_multimodal_embeddings = []
+
+            with torch.no_grad():
+                if isinstance(images, DataLoader):
+                    # check dataloader batch size is the same as batch size
+                    if images.batch_size != batch_size:
+                        raise ValueError(
+                            "Image DataLoader batch size must be the same as the given batch size: "
+                            + str(batch_size)
+                        )
+                    for batch_images, i in tqdm(
+                        zip(images, range(0, len(texts), batch_size))
+                    ):
+                        batch_texts = texts[i : i + batch_size]
+
+                        image_inputs = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )["pixel_values"].to(self.device)
+                        # take the CLS token of the fused (multimodal) representation
+                        multimodal_outputs = self.model.extract_features(
+                            {"text_input": batch_texts, "image": image_inputs}
+                        ).multimodal_embeds[:, 0, :]
+
+                        # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+
+                        all_multimodal_embeddings.append(multimodal_outputs.cpu())
+                else:
+                    for i in tqdm(range(0, len(texts), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        batch_texts = texts[i : i + batch_size]
+
+                        image_inputs = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )["pixel_values"].to(self.device)
+                        multimodal_outputs = self.model.extract_features(
+                            {"text_input": batch_texts, "image": image_inputs}
+                        ).multimodal_embeds[:, 0, :]
+
+                        # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+
+                        all_multimodal_embeddings.append(multimodal_outputs.cpu())
+
+            return torch.cat(all_multimodal_embeddings, dim=0)
+
+        def calculate_probs(self, text_embeddings, image_embeddings):
+            """Image→text probabilities: cosine similarity scaled by 100, softmaxed."""
+            text_embeddings = text_embeddings / text_embeddings.norm(
+                dim=-1, keepdim=True
+            )
+            image_embeddings = image_embeddings / image_embeddings.norm(
+                dim=-1, keepdim=True
+            )
+            logits = torch.matmul(image_embeddings, text_embeddings.T)
+            probs = (logits * 100).softmax(dim=-1)
+            return probs
+
+        def get_fused_embeddings(
+            self,
+            texts: list[str] | None = None,
+            images: list[Image.Image] | DataLoader | None = None,
+            fusion_mode="sum",
+            **kwargs: Any,
+        ):
+            """Encode texts and/or images; fuse by "sum" or BLIP-2's "multimodal" path."""
+            # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
+            if texts is None and images is None:
+                raise ValueError("Either texts or images must be provided")
+
+            text_embeddings = None
+            image_embeddings = None
+
+            if texts is not None:
+                text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+            if images is not None:
+                image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+            if text_embeddings is not None and image_embeddings is not None:
+                if len(text_embeddings) != len(image_embeddings):
+                    raise ValueError(
+                        "The number of texts and images must have the same length"
+                    )
+                if fusion_mode == "sum":
+                    fused_embeddings = text_embeddings + image_embeddings
+                elif fusion_mode == "multimodal":
+                    # NOTE(review): the separate text/image encodings above are
+                    # discarded on this path — wasted compute; consider reordering.
+                    fused_embeddings = self.get_multimodal_embeddings(
+                        texts, images, kwargs.get("batch_size", 32)
+                    )
+                else:
+                    # to do: add other fusion mode
+                    raise ValueError(
+                        f"fusion mode {fusion_mode} hasn't been implemented"
+                    )
+                return fused_embeddings
+            elif text_embeddings is not None:
+                return text_embeddings
+            elif image_embeddings is not None:
+                return image_embeddings
+
+    return BLIP2ModelWrapper(**kwargs)
+
+
+# Training corpora recorded as comments only (no dataset metadata objects yet).
+blip2_training_datasets = {
+    # COCO
+    # CC3M+CC12M+SBU
+    # LAION400M
+}
+
+blip2_opt_2_7b = ModelMeta(
+    loader=partial(
+        blip2_loader,
+        model_name="Salesforce/blip2-opt-2.7b",
+    ),
+    name="Salesforce/blip2-opt-2.7b",
+    languages=["eng_Latn"],
+    revision="51572668da0eb669e01a189dc22abe6088589a24",
+    release_date="2024-03-22",
+    modalities=["image", "text"],
+    n_parameters=3_740_000_000,
+    max_tokens=None,
+    embed_dim=768,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/LAVIS/tree/main/projects/blip2",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip2-opt-2.7b",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=blip2_training_datasets,
+)
+
+blip2_opt_6_7b_coco = ModelMeta(
+    loader=partial(
+        blip2_loader,
+        model_name="Salesforce/blip2-opt-6.7b-coco",
+    ),
+    name="Salesforce/blip2-opt-6.7b-coco",
+    languages=["eng_Latn"],
+    revision="0d580de59320a25a4d2c386387bcef310d5f286e",
+    release_date="2024-03-31",
+    modalities=["image", "text"],
+    n_parameters=7_750_000_000,
+    max_tokens=None,
+    embed_dim=768,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/LAVIS/tree/main/projects/blip2",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip2-opt-6.7b-coco",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=blip2_training_datasets,
+)
diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py
new file mode 100644
index 0000000000..43822465dc
--- /dev/null
+++ b/mteb/models/blip_models.py
@@ -0,0 +1,376 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.nn.functional import normalize
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import BlipForImageTextRetrieval, BlipProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+class BLIPModelWrapper:
+    """Wraps ``BlipForImageTextRetrieval`` for text/image embedding.
+
+    Unlike ``CLIPModelWrapper``, embeddings are taken from the raw towers'
+    CLS token, then projected (``text_proj``/``vision_proj``) and L2-normalized.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        self.device = device
+        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(
+            self.device
+        )
+        self.processor = BlipProcessor.from_pretrained(model_name)
+
+    def preprocess(
+        self,
+        texts: list[str],
+        images: list[Image.Image],
+    ):
+        """Jointly tokenize texts and preprocess images with the HF processor."""
+        return self.processor(
+            text=texts, images=images, return_tensors="pt", padding=True
+        )
+
+    def get_text_embeddings(
+        self,
+        texts: list[str],
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Encode texts in batches; returns projected, normalized CPU embeddings."""
+        all_text_embeddings = []
+
+        with torch.no_grad():
+            for i in tqdm(range(0, len(texts), batch_size)):
+                batch_texts = texts[i : i + batch_size]
+                inputs = self.processor(
+                    text=batch_texts, return_tensors="pt", padding=True, truncation=True
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization
+                text_outputs = self.model.text_encoder(**inputs)
+                text_outputs = text_outputs[0]
+                text_outputs = normalize(
+                    self.model.text_proj(text_outputs[:, 0, :]), dim=-1
+                )
+                all_text_embeddings.append(text_outputs.cpu())
+
+        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+        return all_text_embeddings
+
+    def get_image_embeddings(
+        self,
+        images: list[Image.Image] | DataLoader,
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Encode images (DataLoader or list); returns projected, normalized CPU embeddings."""
+        all_image_embeddings = []
+
+        if isinstance(images, DataLoader):
+            with torch.no_grad():
+                for batch in tqdm(images):
+                    inputs = self.processor(
+                        images=batch, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    # CLS token -> vision projection -> unit norm
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
+                    all_image_embeddings.append(image_outputs.cpu())
+        else:
+            with torch.no_grad():
+                for i in tqdm(range(0, len(images), batch_size)):
+                    batch_images = images[i : i + batch_size]
+                    inputs = self.processor(
+                        images=batch_images, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
+                    all_image_embeddings.append(image_outputs.cpu())
+
+        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+        return all_image_embeddings
+
+    def calculate_probs(self, text_embeddings, image_embeddings):
+        """Image→text probabilities: cosine similarity scaled by 100, softmaxed."""
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+        image_embeddings = image_embeddings / image_embeddings.norm(
+            dim=-1, keepdim=True
+        )
+        logits = torch.matmul(image_embeddings, text_embeddings.T)
+        probs = (logits * 100).softmax(dim=-1)
+        return probs
+
+    def get_fused_embeddings(
+        self,
+        texts: list[str] | None = None,
+        images: list[Image.Image] | DataLoader | None = None,
+        fusion_mode="sum",
+        **kwargs: Any,
+    ):
+        """Encode texts and/or images; only element-wise "sum" fusion is implemented."""
+        if texts is None and images is None:
+            raise ValueError("Either texts or images must be provided")
+
+        text_embeddings = None
+        image_embeddings = None
+
+        if texts is not None:
+            text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+        if images is not None:
+            image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            if fusion_mode == "sum":
+                fused_embeddings = text_embeddings + image_embeddings
+            else:
+                # to do: add other fusion mode
+                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+
+# in descending order of usage (downloads from huggingface)
+# All entries share BLIPModelWrapper; they differ only in checkpoint metadata.
+blip_image_captioning_large = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-image-captioning-large",
+    ),
+    name="Salesforce/blip-image-captioning-large",
+    languages=["eng_Latn"],
+    revision="2227ac38c9f16105cb0412e7cab4759978a8fd90",
+    release_date="2023-12-07",
+    modalities=["image", "text"],
+    n_parameters=470_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-image-captioning-large",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # COCO
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+blip_image_captioning_base = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-image-captioning-base",
+    ),
+    name="Salesforce/blip-image-captioning-base",
+    languages=["eng_Latn"],
+    revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84",
+    release_date="2023-08-01",
+    modalities=["image", "text"],
+    n_parameters=247_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-image-captioning-base",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # COCO
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+
+blip_vqa_base = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-vqa-base",
+    ),
+    name="Salesforce/blip-vqa-base",
+    languages=["eng_Latn"],
+    revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64",
+    release_date="2023-12-07",
+    modalities=["image", "text"],
+    n_parameters=247_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-vqa-base",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+blip_vqa_capfilt_large = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-vqa-capfilt-large",
+    ),
+    name="Salesforce/blip-vqa-capfilt-large",
+    languages=["eng_Latn"],
+    revision="e53f95265aeab69013fabb5380500ab984adbbb4",
+    release_date="2023-01-22",
+    modalities=["image", "text"],
+    n_parameters=247_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-vqa-capfilt-large",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+blip_itm_base_coco = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-base-coco",
+    ),
+    name="Salesforce/blip-itm-base-coco",
+    languages=["eng_Latn"],
+    revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f",
+    release_date="2023-08-01",
+    modalities=["image", "text"],
+    n_parameters=247_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-itm-base-coco",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+blip_itm_large_coco = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-large-coco",
+    ),
+    name="Salesforce/blip-itm-large-coco",
+    languages=["eng_Latn"],
+    revision="fef05cafc05298067cbbca00b125749394a77a6f",
+    release_date="2023-08-01",
+    modalities=["image", "text"],
+    n_parameters=470_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-itm-large-coco",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # COCO
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
+
+blip_itm_base_flickr = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-base-flickr",
+    ),
+    name="Salesforce/blip-itm-base-flickr",
+    languages=["eng_Latn"],
+    revision="1de29e660d91ae1786c1876212ea805a22eab251",
+    release_date="2023-08-01",
+    modalities=["image", "text"],
+    n_parameters=247_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-itm-base-flickr",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # CC3M+CC12M+SBU
+        # LAION115M
+        # Flickr30k
+    },
+)
+
+blip_itm_large_flickr = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-large-flickr",
+    ),
+    name="Salesforce/blip-itm-large-flickr",
+    languages=["eng_Latn"],
+    revision="bda12e6506758f54261b5ab174b2c55a3ba143fb",
+    release_date="2023-08-01",
+    modalities=["image", "text"],
+    n_parameters=470_000_000,
+    max_tokens=512,
+    embed_dim=768,
+    license="bsd-3-clause",
+    open_weights=True,
+    public_training_code="https://github.com/salesforce/BLIP",
+    public_training_data="https://github.com/salesforce/BLIP",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Salesforce/blip-itm-large-flickr",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # CC3M+CC12M+SBU
+        # LAION115M
+    },
+)
diff --git a/mteb/models/clip_models.py b/mteb/models/clip_models.py
new file mode 100644
index 0000000000..faee0e7c9d
--- /dev/null
+++ b/mteb/models/clip_models.py
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+class CLIPModelWrapper:
+ def __init__(
+ self,
+ model_name: str,
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
+ **kwargs: Any,
+ ):
+ self.model_name = model_name
+ self.device = device
+ self.model = AutoModel.from_pretrained(model_name).to(self.device)
+ self.processor = AutoProcessor.from_pretrained(model_name)
+
+ def preprocess(
+ self,
+ texts: list[str],
+ images: list[Image.Image],
+ ):
+ return self.processor(
+ text=texts, images=images, return_tensors="pt", padding=True
+ )
+
+ def get_text_embeddings(
+ self,
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_text_embeddings = []
+
+ with torch.no_grad():
+ for i in tqdm(range(0, len(texts), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ inputs = self.processor(
+ text=batch_texts, return_tensors="pt", padding=True, truncation=True
+ )
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ text_outputs = self.model.get_text_features(**inputs)
+ all_text_embeddings.append(text_outputs.cpu())
+
+ all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+ return all_text_embeddings
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_image_embeddings = []
+
+ if isinstance(images, DataLoader):
+ with torch.no_grad():
+ for batch in tqdm(images):
+ inputs = self.processor(
+ images=batch, return_tensors="pt", padding=True
+ )
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ image_outputs = self.model.get_image_features(**inputs)
+ all_image_embeddings.append(image_outputs.cpu())
+ else:
+ with torch.no_grad():
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ inputs = self.processor(
+ images=batch_images, return_tensors="pt", padding=True
+ )
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ image_outputs = self.model.get_image_features(**inputs)
+ all_image_embeddings.append(image_outputs.cpu())
+
+ all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+ return all_image_embeddings
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] | DataLoader = None,
+ fusion_mode="sum",
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("Either texts or images must be provided")
+
+ text_embeddings = None
+ image_embeddings = None
+
+ if texts is not None:
+ text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+ if images is not None:
+ image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+ if text_embeddings is not None and image_embeddings is not None:
+ if len(text_embeddings) != len(image_embeddings):
+ raise ValueError(
+ "The number of texts and images must have the same length"
+ )
+ if fusion_mode == "sum":
+ fused_embeddings = text_embeddings + image_embeddings
+ else:
+ # to do: add other fusion mode
+ raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+ return fused_embeddings
+ elif text_embeddings is not None:
+ return text_embeddings
+ elif image_embeddings is not None:
+ return image_embeddings
+
+
+# OpenAI CLIP checkpoints; all served by CLIPModelWrapper.
+clip_vit_large_patch14 = ModelMeta(
+    loader=partial(
+        CLIPModelWrapper,
+        model_name="openai/clip-vit-large-patch14",
+    ),
+    name="openai/clip-vit-large-patch14",
+    languages=["eng_Latn"],
+    revision="32bd64288804d66eefd0ccbe215aa642df71cc41",
+    release_date="2021-02-26",
+    modalities=["image", "text"],
+    n_parameters=428_000_000,
+    max_tokens=77,
+    embed_dim=768,
+    license=None,
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/openai/clip-vit-large-patch14",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=None,
+)
+
+clip_vit_base_patch32 = ModelMeta(
+    loader=partial(
+        CLIPModelWrapper,
+        model_name="openai/clip-vit-base-patch32",
+    ),
+    name="openai/clip-vit-base-patch32",
+    languages=["eng_Latn"],
+    revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268",
+    release_date="2021-02-26",
+    modalities=["image", "text"],
+    n_parameters=151_000_000,
+    max_tokens=77,
+    embed_dim=512,
+    license=None,
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/openai/clip-vit-base-patch32",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=None,
+)
+
+clip_vit_base_patch16 = ModelMeta(
+    loader=partial(
+        CLIPModelWrapper,
+        model_name="openai/clip-vit-base-patch16",
+    ),
+    name="openai/clip-vit-base-patch16",
+    languages=["eng_Latn"],
+    revision="57c216476eefef5ab752ec549e440a49ae4ae5f3",
+    release_date="2021-02-26",
+    modalities=["image", "text"],
+    n_parameters=151_000_000,
+    max_tokens=77,
+    embed_dim=512,
+    license=None,
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/openai/clip-vit-base-patch16",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=None,
+)
diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py
new file mode 100644
index 0000000000..c84d5ff640
--- /dev/null
+++ b/mteb/models/cohere_v.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+import base64
+import io
+import os
+import time
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+# Read once at import time; None when unset (client construction will then fail).
+api_key = os.getenv("COHERE_API_KEY")
+# Used in the DataLoader path to convert tensor images back to PIL for the API.
+tensor_to_image = transforms.Compose([transforms.ToPILImage()])
+
+
+def cohere_v_loader(**kwargs):
+ try:
+ import cohere
+ except ImportError:
+ raise ImportError("To use cohere models, please run `pip install cohere`.")
+
+ class CohereMultiModalModelWrapper:
+ def __init__(
+ self,
+ model_name: str,
+ **kwargs: Any,
+ ):
+ self.model_name = model_name
+ self.client = cohere.ClientV2(api_key)
+ self.image_format = "JPEG"
+ """ Wrapper for Cohere multimodal embedding model,
+
+ do `export COHERE_API_KEY=` before running eval scripts.
+ Cohere currently supports 40 images/min, thus time.sleep(1.5) is applied after each image.
+ Remove or adjust this after Cohere API changes capacity.
+ """
+
+ def get_text_embeddings(
+ self,
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_text_embeddings = []
+
+ for i in tqdm(range(0, len(texts), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ response = self.client.embed(
+ texts=batch_texts,
+ model=self.model_name,
+ input_type="search_document",
+ )
+ all_text_embeddings.append(torch.tensor(response.embeddings.float))
+
+ all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+ return all_text_embeddings
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_image_embeddings = []
+
+ if isinstance(images, DataLoader):
+ for batch in tqdm(images):
+ for image in batch:
+ # cohere only supports 1 image per call
+ buffered = io.BytesIO()
+ image = tensor_to_image(image)
+ image.save(buffered, format=self.image_format)
+ image_bytes = buffered.getvalue()
+ stringified_buffer = base64.b64encode(image_bytes).decode(
+ "utf-8"
+ )
+ content_type = f"image/{self.image_format.lower()}"
+ image_base64 = (
+ f"data:{content_type};base64,{stringified_buffer}"
+ )
+ response = self.client.embed(
+ model=self.model_name,
+ input_type="image",
+ embedding_types=["float"],
+ images=[image_base64],
+ )
+ all_image_embeddings.append(
+ torch.tensor(response.embeddings.float)
+ )
+ time.sleep(1.5)
+ else:
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ for image in batch_images:
+ # cohere only supports 1 image per call
+ buffered = io.BytesIO()
+ image.save(buffered, format=self.image_format)
+ image_bytes = buffered.getvalue()
+ stringified_buffer = base64.b64encode(image_bytes).decode(
+ "utf-8"
+ )
+ content_type = f"image/{self.image_format.lower()}"
+ image_base64 = (
+ f"data:{content_type};base64,{stringified_buffer}"
+ )
+ response = self.client.embed(
+ model=self.model_name,
+ input_type="image",
+ embedding_types=["float"],
+ images=[image_base64],
+ )
+ all_image_embeddings.append(
+ torch.tensor(response.embeddings.float)
+ )
+ time.sleep(1.5)
+ all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+ return all_image_embeddings
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] | DataLoader = None,
+ fusion_mode="sum",
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("Either texts or images must be provided")
+
+ text_embeddings = None
+ image_embeddings = None
+
+ if texts is not None:
+ text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+ if images is not None:
+ image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+ if text_embeddings is not None and image_embeddings is not None:
+ if len(text_embeddings) != len(image_embeddings):
+ raise ValueError(
+ "The number of texts and images must have the same length"
+ )
+ if fusion_mode == "sum":
+ fused_embeddings = text_embeddings + image_embeddings
+ else:
+ # to do: add other fusion mode
+ raise ValueError(
+ f"fusion mode {fusion_mode} hasn't been implemented"
+ )
+ return fused_embeddings
+ elif text_embeddings is not None:
+ return text_embeddings
+ elif image_embeddings is not None:
+ return image_embeddings
+
+ return CohereMultiModalModelWrapper(**kwargs)
+
+
+# API-only models (open_weights=False); "-v" suffix marks the vision-enabled entry.
+cohere_mult_3 = ModelMeta(
+    loader=partial(cohere_v_loader, model_name="embed-multilingual-v3.0"),
+    name="embed-multilingual-v3.0-v",
+    languages=[],  # Unknown, but support >100 languages
+    revision="1",
+    release_date="2024-10-24",
+    n_parameters=None,
+    max_tokens=None,
+    embed_dim=1024,
+    license=None,
+    similarity_fn_name="cosine",
+    framework=[],
+    modalities=["image", "text"],
+    open_weights=False,
+    public_training_code=None,
+    public_training_data=None,
+    reference="https://huggingface.co/Cohere/Cohere-embed-multilingual-v3.0",
+    use_instructions=False,
+    training_datasets=None,
+)
+
+cohere_eng_3 = ModelMeta(
+    loader=partial(cohere_v_loader, model_name="embed-english-v3.0"),
+    name="embed-english-v3.0-v",
+    languages=["eng-Latn"],
+    revision="1",
+    release_date="2024-10-24",
+    n_parameters=None,
+    max_tokens=None,
+    embed_dim=1024,
+    license=None,
+    similarity_fn_name="cosine",
+    framework=[],
+    modalities=["image", "text"],
+    open_weights=False,
+    public_training_code=None,
+    public_training_data=None,
+    reference="https://huggingface.co/Cohere/Cohere-embed-english-v3.0",
+    use_instructions=False,
+    training_datasets=None,
+)
diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py
new file mode 100644
index 0000000000..c2cd4db5fe
--- /dev/null
+++ b/mteb/models/dino_models.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoImageProcessor, AutoModel
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+class DINOModelWrapper:
+ """A wrapper class for DINO models that supports image encoding.
+ Text encoding and text-image fusion are not supported.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
+ **kwargs: Any,
+ ):
+ self.model_name = model_name
+ self.device = device
+ self.model = AutoModel.from_pretrained(model_name).to(self.device)
+ self.processor = AutoImageProcessor.from_pretrained(model_name)
+
+ @staticmethod
+ def get_text_embeddings(
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ raise ValueError("DINO models only support image encoding.")
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ pooling="cls",
+ **kwargs: Any,
+ ):
+ all_image_embeddings = []
+
+ if isinstance(images, DataLoader):
+ with torch.no_grad():
+ for batch in tqdm(images):
+ inputs = self.processor(images=batch, return_tensors="pt")
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ image_outputs = self.model(**inputs)
+ features = image_outputs.last_hidden_state
+ if pooling == "cls":
+ features = features[:, 0, :] # TODO: confirm best practice
+ elif pooling == "mean":
+ features = features.mean(dim=1)
+ else:
+ raise ValueError(
+ "Pooling methods not implemented. Use cls or mean."
+ )
+ all_image_embeddings.append(features.cpu())
+ else:
+ with torch.no_grad():
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ inputs = self.processor(images=batch_images, return_tensors="pt")
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ image_outputs = self.model(**inputs)
+ features = image_outputs.last_hidden_state
+ if pooling == "cls":
+ features = features[:, 0, :]
+ elif pooling == "mean":
+ features = features.mean(dim=1)
+ else:
+ raise ValueError(
+ "Pooling methods not implemented. Use cls or mean."
+ )
+ all_image_embeddings.append(features.cpu())
+
+ all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+ return all_image_embeddings
+
+ @staticmethod
+ def calculate_probs(text_embeddings, image_embeddings):
+ raise ValueError("DINO models only support image encoding.")
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] | DataLoader = None,
+ fusion_mode="sum",
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("images must be provided for DINO models")
+
+ text_embeddings = None
+ image_embeddings = None
+
+ if texts is not None:
+ text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+ if images is not None:
+ image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+ if text_embeddings is not None and image_embeddings is not None:
+ raise ValueError("DINO models only support image encoding.")
+ elif text_embeddings is not None:
+ return text_embeddings
+ elif image_embeddings is not None:
+ return image_embeddings
+
+
+dinov2_training_datasets = {
+ # LVD-142M
+ # ImageNet-22k
+}
+
+
+dinov2_small = ModelMeta(
+ loader=partial(
+ DINOModelWrapper,
+ model_name="facebook/dinov2-small",
+ ),
+ name="facebook/dinov2-small",
+ languages=["eng_Latn"],
+ revision="ed25f3a31f01632728cabb09d1542f84ab7b0056",
+ release_date="2023-07-18",
+ modalities=["image"],
+ n_parameters=22_100_000,
+ max_tokens=None,
+ embed_dim=384,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/facebookresearch/dinov2",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/facebook/dinov2-small",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=dinov2_training_datasets,
+)
+
+dinov2_base = ModelMeta(
+ loader=partial(
+ DINOModelWrapper,
+ model_name="facebook/dinov2-base",
+ ),
+ name="facebook/dinov2-base",
+ languages=["eng_Latn"],
+ revision="f9e44c814b77203eaa57a6bdbbd535f21ede1415",
+ release_date="2023-07-18",
+ modalities=["image"],
+ n_parameters=86_600_000,
+ max_tokens=None,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/facebookresearch/dinov2",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/facebook/dinov2-base",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=dinov2_training_datasets,
+)
+
+dinov2_large = ModelMeta(
+ loader=partial(
+ DINOModelWrapper,
+ model_name="facebook/dinov2-large",
+ ),
+ name="facebook/dinov2-large",
+ languages=["eng_Latn"],
+ revision="47b73eefe95e8d44ec3623f8890bd894b6ea2d6c",
+ release_date="2023-07-18",
+ modalities=["image"],
+ n_parameters=304_000_000,
+ max_tokens=None,
+ embed_dim=1024,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/facebookresearch/dinov2",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/facebook/dinov2-large",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=dinov2_training_datasets,
+)
+
+dinov2_giant = ModelMeta(
+ loader=partial(
+ DINOModelWrapper,
+ model_name="facebook/dinov2-giant",
+ ),
+ name="facebook/dinov2-giant",
+ languages=["eng_Latn"],
+ revision="611a9d42f2335e0f921f1e313ad3c1b7178d206d",
+ release_date="2023-07-18",
+ modalities=["image"],
+ n_parameters=1_140_000_000,
+ max_tokens=None,
+ embed_dim=1536,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/facebookresearch/dinov2",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/facebook/dinov2-giant",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=dinov2_training_datasets,
+)
diff --git a/mteb/models/e5_v.py b/mteb/models/e5_v.py
new file mode 100644
index 0000000000..909cfcbab7
--- /dev/null
+++ b/mteb/models/e5_v.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+import transformers
+from packaging import version
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+E5_V_TRANSFORMERS_VERSION = (
+ "4.44.2" # Issue 1647: Only works with transformers==4.44.2.
+)
+
+
+class E5VWrapper:
+ def __init__(
+ self,
+ model_name: str,
+ composed_prompt=None,
+ **kwargs: Any,
+ ):
+ if version.parse(transformers.__version__) > version.parse(
+ E5_V_TRANSFORMERS_VERSION
+ ):
+ raise ImportError(
+ f"This wrapper only works with transformers=={E5_V_TRANSFORMERS_VERSION}"
+ )
+
+ self.model_name = model_name
+ self.processor = LlavaNextProcessor.from_pretrained(model_name)
+ if "device" in kwargs:
+ self.device = kwargs.pop("device")
+ self.model = LlavaNextForConditionalGeneration.from_pretrained(
+ model_name, **kwargs
+ )
+ self.model.eval()
+ self.template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+ self.text_prompt = self.template.format(
+ "\nSummary above sentence in one word: "
+ )
+ self.img_prompt = self.template.format(
+ "\nSummary above image in one word: "
+ )
+ if not composed_prompt:
+ # default composed embedding, to_do: move it to get_fused_embedding with "prompt_name" like MTEB text ones.
+ self.composed_prompt = self.template.format(
+ '[INST] Modify this image with "{}" Describe modified image in one word: [/INST]'
+ )
+ else:
+ self.composed_prompt = self.template.format(composed_prompt)
+
+ def get_text_embeddings(
+ self,
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 8,
+ **kwargs: Any,
+ ):
+ all_text_embeddings = []
+
+ with torch.no_grad():
+ for i in tqdm(range(0, len(texts), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ text_inputs = self.processor(
+ [self.text_prompt.replace("", text) for text in batch_texts],
+ return_tensors="pt",
+ padding=True,
+ ).to("cuda")
+ text_outputs = self.model(
+ **text_inputs, output_hidden_states=True, return_dict=True
+ ).hidden_states[-1][:, -1, :]
+ all_text_embeddings.append(text_outputs.cpu())
+ return torch.cat(all_text_embeddings, dim=0)
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 8,
+ **kwargs: Any,
+ ):
+ all_image_embeddings = []
+
+ with torch.no_grad():
+ if isinstance(images, DataLoader):
+ for batch_images in tqdm(images):
+ img_inputs = self.processor(
+ [self.img_prompt] * len(batch_images),
+ batch_images,
+ return_tensors="pt",
+ padding=True,
+ ).to("cuda")
+ image_outputs = self.model(
+ **img_inputs, output_hidden_states=True, return_dict=True
+ ).hidden_states[-1][:, -1, :]
+ all_image_embeddings.append(image_outputs.cpu())
+ else:
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ img_inputs = self.processor(
+ [self.img_prompt] * len(batch_images),
+ batch_images,
+ return_tensors="pt",
+ padding=True,
+ ).to("cuda")
+ image_outputs = self.model(
+ **img_inputs, output_hidden_states=True, return_dict=True
+ ).hidden_states[-1][:, -1, :]
+ all_image_embeddings.append(image_outputs.cpu())
+ return torch.cat(all_image_embeddings, dim=0)
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] = None,
+ batch_size: int = 8,
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("Either texts or images must be provided")
+
+ all_fused_embeddings = []
+ kwargs.update(batch_size=batch_size)
+
+ if texts is not None and images is not None:
+ with torch.no_grad():
+ if isinstance(images, DataLoader):
+ for index, batch_images in enumerate(tqdm(images)):
+ batch_texts = texts[
+ index * batch_size : (index + 1) * batch_size
+ ]
+ prompts = [
+ self.composed_prompt.format(text) for text in batch_texts
+ ]
+ inputs = self.processor(
+ prompts, batch_images, return_tensors="pt", padding=True
+ ).to("cuda")
+ outputs = self.model(
+ **inputs, output_hidden_states=True, return_dict=True
+ ).hidden_states[-1][:, -1, :]
+ all_fused_embeddings.append(outputs.cpu())
+ else:
+ if len(texts) != len(images):
+ raise ValueError(
+ "The number of texts and images must have the same length"
+ )
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ batch_images = images[i : i + batch_size]
+ prompts = [
+ self.composed_prompt.format(text) for text in batch_texts
+ ]
+ inputs = self.processor(
+ prompts, batch_images, return_tensors="pt", padding=True
+ ).to("cuda")
+ outputs = self.model(
+ **inputs, output_hidden_states=True, return_dict=True
+ ).hidden_states[-1][:, -1, :]
+ all_fused_embeddings.append(outputs.cpu())
+ return torch.cat(all_fused_embeddings, dim=0)
+ elif texts is not None:
+ text_embeddings = self.get_text_embeddings(texts, **kwargs)
+ return text_embeddings
+ elif images is not None:
+ image_embeddings = self.get_image_embeddings(images, **kwargs)
+ return image_embeddings
+
+
+e5_v = ModelMeta(
+ loader=partial(
+ E5VWrapper,
+ model_name="royokong/e5-v",
+ torch_dtype=torch.float16,
+ device_map="auto",
+ ),
+ name="royokong/e5-v",
+ languages=["eng_Latn"],
+ revision="0c1f22679417b3ae925d779442221c40cd1861ab",
+ release_date="2024-07-17",
+ modalities=["image", "text"],
+ n_parameters=8_360_000_000,
+ max_tokens=8192,
+ embed_dim=4096,
+ license=None,
+ open_weights=True,
+ public_training_code="https://github.com/kongds/E5-V",
+ public_training_data="https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/royokong/e5-v",
+ similarity_fn_name=None,
+ use_instructions=True,
+ training_datasets={
+ # princeton-nlp/datasets-for-simcse
+ },
+)
diff --git a/mteb/models/evaclip_models.py b/mteb/models/evaclip_models.py
new file mode 100644
index 0000000000..fdd25771d4
--- /dev/null
+++ b/mteb/models/evaclip_models.py
@@ -0,0 +1,272 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+def evaclip_loader(**kwargs):
+ try:
+ import os
+ import sys
+
+ sys.path.insert(0, os.path.join(os.getcwd(), "EVA/EVA-CLIP/rei"))
+
+ from eva_clip import create_model_and_transforms, get_tokenizer
+ except ImportError:
+ # https://github.com/baaivision/EVA/tree/master/EVA-CLIP#setup
+ raise ImportError(
+ "Please run `git clone git@github.com:baaivision/EVA.git`,"
+ "`pip install ninja timm`"
+ "`pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers`"
+ "`git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./`"
+ )
+
+ class EvaCLIPWrapper:
+ def __init__(
+ self,
+ model_name: str = "EVA02-CLIP-B-16",
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
+ **kwargs: Any,
+ ):
+ self.model_name = model_name
+ self.device = device
+ pretrained = "eva_clip" # or "/path/to/EVA02_CLIP_B_psz16_s8B.pt"
+ self.model, _, self.img_preprocess = create_model_and_transforms(
+ model_name, pretrained, force_custom_clip=True, device=device
+ )
+ self.model.eval()
+ self.tokenizer = get_tokenizer(model_name)
+
+ def encode( # type: ignore
+ self,
+ sentences: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ return self.get_text_embeddings(texts=sentences, batch_size=batch_size)
+
+ def get_text_embeddings(
+ self,
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_text_embeddings = []
+
+ with torch.no_grad(), torch.cuda.amp.autocast():
+ for i in tqdm(range(0, len(texts), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ inputs = self.tokenizer(batch_texts)
+ text_outputs = self.model.encode_text(inputs.to(self.device))
+ all_text_embeddings.append(text_outputs.cpu())
+
+ all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+ return all_text_embeddings
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ **kwargs: Any,
+ ):
+ all_image_embeddings = []
+ if isinstance(images, DataLoader):
+ import torchvision.transforms.functional as F
+
+ with torch.no_grad(), torch.cuda.amp.autocast():
+ for batch in tqdm(images):
+                    # DataLoader batches arrive as tensors; convert each back to PIL before img_preprocess
+ inputs = torch.vstack(
+ [
+ self.img_preprocess(F.to_pil_image(b)).unsqueeze(0)
+ for b in batch
+ ]
+ )
+ image_outputs = self.model.encode_image(inputs.to(self.device))
+ all_image_embeddings.append(image_outputs.cpu())
+ else:
+ with torch.no_grad(), torch.cuda.amp.autocast():
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ inputs = torch.vstack(
+ [self.img_preprocess(b).unsqueeze(0) for b in batch_images]
+ )
+ image_outputs = self.model.encode_image(inputs.to(self.device))
+ all_image_embeddings.append(image_outputs.cpu())
+
+ all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+ return all_image_embeddings
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] | DataLoader = None,
+ fusion_mode="sum",
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("Either texts or images must be provided")
+
+ text_embeddings = None
+ image_embeddings = None
+
+ if texts is not None:
+ text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+ if images is not None:
+ image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+ if text_embeddings is not None and image_embeddings is not None:
+ if len(text_embeddings) != len(image_embeddings):
+ raise ValueError(
+ "The number of texts and images must have the same length"
+ )
+ if fusion_mode == "sum":
+ fused_embeddings = text_embeddings + image_embeddings
+ else:
+                    # TODO: add other fusion modes
+ raise ValueError(
+ f"fusion mode {fusion_mode} hasn't been implemented"
+ )
+ return fused_embeddings
+ elif text_embeddings is not None:
+ return text_embeddings
+ elif image_embeddings is not None:
+ return image_embeddings
+
+ return EvaCLIPWrapper(**kwargs)
+
+
+training_code = "https://github.com/baaivision/EVA/tree/master/EVA-CLIP"
+training_datasets = {
+ # COYO-700M, random sample 400M. https://github.com/kakaobrain/coyo-dataset
+ # LAION-2B, random sample 1.6B. https://laion.ai/blog/laion-5b/
+}
+laion_2b = {
+ # LAION-2B
+}
+
+EVA02_CLIP_B_16 = ModelMeta(
+ loader=partial(
+ evaclip_loader,
+ model_name="EVA02-CLIP-B-16",
+ ),
+ name="EVA02-CLIP-B-16",
+ languages=["eng_Latn"],
+ revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
+ release_date="2023-04-26",
+ modalities=["image", "text"],
+ n_parameters=149_000_000,
+ max_tokens=77,
+ embed_dim=512,
+ license="mit",
+ open_weights=True,
+ public_training_code=training_code,
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/QuanSun/EVA-CLIP",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=training_datasets,
+)
+
+EVA02_CLIP_L_14 = ModelMeta(
+ loader=partial(
+ evaclip_loader,
+ model_name="EVA02-CLIP-L-14",
+ ),
+ name="EVA02-CLIP-L-14",
+ languages=["eng_Latn"],
+ revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
+ release_date="2023-04-26",
+ modalities=["image", "text"],
+ n_parameters=428_000_000,
+ max_tokens=77,
+ embed_dim=768,
+ license="mit",
+ open_weights=True,
+ public_training_code=training_code,
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/QuanSun/EVA-CLIP",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=training_datasets,
+)
+
+EVA02_CLIP_bigE_14 = ModelMeta(
+ loader=partial(
+ evaclip_loader,
+ model_name="EVA02-CLIP-bigE-14",
+ ),
+ name="EVA02-CLIP-bigE-14",
+ languages=["eng_Latn"],
+ revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
+ release_date="2023-04-26",
+ modalities=["image", "text"],
+ n_parameters=4_700_000_000,
+ max_tokens=77,
+ embed_dim=1024,
+ license="mit",
+ open_weights=True,
+ public_training_code=training_code,
+ public_training_data="https://laion.ai/blog/laion-5b/",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/QuanSun/EVA-CLIP",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=laion_2b,
+)
+
+
+EVA02_CLIP_bigE_14_plus = ModelMeta(
+ loader=partial(
+ evaclip_loader,
+ model_name="EVA02-CLIP-bigE-14-plus",
+ ),
+ name="EVA02-CLIP-bigE-14-plus",
+ languages=["eng_Latn"],
+ revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
+ release_date="2023-04-26",
+ modalities=["image", "text"],
+ n_parameters=5_000_000_000,
+ max_tokens=77,
+ embed_dim=1024,
+ license="mit",
+ open_weights=True,
+ public_training_code=training_code,
+ public_training_data="https://laion.ai/blog/laion-5b/",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/QuanSun/EVA-CLIP",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=laion_2b,
+)
diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py
new file mode 100644
index 0000000000..14812c0859
--- /dev/null
+++ b/mteb/models/gme_v_models.py
@@ -0,0 +1,475 @@
+from __future__ import annotations
+
+import logging
+import math
+import os
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm.autonotebook import tqdm
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
+import mteb
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+from .instructions import DEFAULT_PROMPTS, TASKNAME2INSTRUCTIONS
+
+logging.basicConfig(level=logging.WARNING)
+logger = logging.getLogger(__name__)
+
+HF_GME_QWEN2VL_2B = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"
+HF_GME_QWEN2VL_7B = "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
+
+
+def get_gme_instruction(task_name: str, is_query: bool = True) -> str:
+ # TODO Prompts for other multimodal tasks.
+ if task_name in TASKNAME2INSTRUCTIONS:
+ prompt = TASKNAME2INSTRUCTIONS[task_name]
+ if isinstance(prompt, tuple):
+ prompt = prompt[0] if is_query else prompt[1]
+ else:
+ meta = mteb.get_task(task_name).metadata
+ prompt = DEFAULT_PROMPTS.get(meta.type, None)
+
+ if isinstance(prompt, str) and prompt[-1] != ".":
+ prompt += "."
+ return prompt
+
+
+class Encoder(torch.nn.Module):
+ def __init__(
+ self,
+ base,
+ processor,
+ max_length=1800,
+ normalize=True,
+ ) -> None:
+ super().__init__()
+ self.base = base
+ self.processor = processor
+ self.max_length = max_length
+ self.normalize = normalize
+ self.processor.tokenizer.padding_side = "right"
+ self.defualt_instruction = "You are a helpful assistant."
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor | None = None,
+ attention_mask: torch.Tensor | None = None,
+ position_ids: torch.LongTensor | None = None,
+ past_key_values: list[torch.FloatTensor] | None = None,
+ inputs_embeds: torch.FloatTensor | None = None,
+ pixel_values: torch.Tensor | None = None,
+ # pixel_values_videos: torch.FloatTensor | None = None,
+ image_grid_thw: torch.LongTensor | None = None,
+ # video_grid_thw: torch.LongTensor | None = None,
+ pooling_mask: torch.LongTensor | None = None,
+ **kwargs,
+ ) -> torch.Tensor:
+ if inputs_embeds is None:
+ inputs_embeds = self.base.model.embed_tokens(input_ids)
+ if pixel_values is not None:
+ pixel_values = pixel_values.type(self.base.visual.get_dtype())
+ image_embeds = self.base.visual(
+ pixel_values, grid_thw=image_grid_thw
+ ).to(inputs_embeds.device)
+ image_mask = input_ids == self.base.config.image_token_id
+ inputs_embeds[image_mask] = image_embeds
+ # if pixel_values_videos is not None:
+ # pixel_values_videos = pixel_values_videos.type(self.base.visual.get_dtype())
+ # video_embeds = self.base.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
+ # video_mask = input_ids == self.base.config.video_token_id
+ # inputs_embeds[video_mask] = video_embeds
+ if attention_mask is not None:
+ attention_mask = attention_mask.to(inputs_embeds.device)
+
+ outputs = self.base.model(
+ input_ids=None,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ )
+
+ pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+ left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0] # TODO
+ if left_padding:
+ embeddings = outputs.last_hidden_state[:, -1]
+ else:
+ sequence_lengths = pooling_mask.sum(dim=1) - 1
+ batch_size = outputs.last_hidden_state.shape[0]
+ embeddings = outputs.last_hidden_state[
+ torch.arange(batch_size, device=outputs.last_hidden_state.device),
+ sequence_lengths,
+ ]
+ if self.normalize:
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+ return embeddings.contiguous()
+
+ def embed(
+ self,
+ texts: list[str],
+ images: list[Image.Image],
+ device,
+ instruction=None,
+ **kwargs,
+ ):
+ instruction = instruction or self.defualt_instruction
+ # Inputs must be batched
+ input_texts, input_images = [], []
+ for t, i in zip(texts, images):
+ input_str = ""
+ if i is None:
+ input_images = None # All examples in the same batch are consistent
+ else:
+ input_str += "<|vision_start|><|image_pad|><|vision_end|>"
+ i = fetch_image(i)
+ input_images.append(i)
+ if t is not None:
+ input_str += t
+ msg = f"<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
+ input_texts.append(msg)
+
+ inputs = self.processor(
+ text=input_texts,
+ images=input_images,
+ padding=True,
+ truncation=True,
+ max_length=self.max_length,
+ return_tensors="pt",
+ )
+ inputs = {k: v.to(device) for k, v in inputs.items()} # TODO
+ embeddings = self.forward(**inputs)
+ return embeddings
+
+
+class GmeQwen2VL:
+ def __init__(
+ self,
+ model_name: str = HF_GME_QWEN2VL_2B,
+ model_path: str | None = None,
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
+ min_image_tokens=4,
+ max_image_tokens=1280,
+ max_length=1800,
+ **kwargs,
+ ) -> None:
+ model_name = model_path or model_name
+ base = AutoModelForVision2Seq.from_pretrained(
+ model_name, torch_dtype=torch.float16, **kwargs
+ )
+ min_pixels = min_image_tokens * 28 * 28
+ max_pixels = max_image_tokens * 28 * 28
+ processor = AutoProcessor.from_pretrained(
+ model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
+ )
+ self.model = Encoder(base, processor, max_length=max_length)
+ self.model.eval()
+ self.device = device
+ self.sep = " "
+
+ def encode(
+ self,
+ sentences: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ **kwargs: Any,
+ ):
+ return self.get_fused_embeddings(
+ texts=sentences, task_name=task_name, prompt_type=prompt_type, **kwargs
+ )
+
+ def encode_queries(self, queries: list[str], **kwargs):
+ kwargs.update(prompt_type=PromptType.query)
+ embeddings = self.encode(queries, **kwargs)
+ return embeddings
+
+ def encode_corpus(self, corpus: list[dict[str, str]], **kwargs):
+ if type(corpus) is dict:
+ sentences = [
+ (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
+ if "title" in corpus
+ else corpus["text"][i].strip()
+ for i in range(len(corpus["text"]))
+ ]
+ else:
+ sentences = [
+ (doc["title"] + self.sep + doc["text"]).strip()
+ if "title" in doc
+ else doc["text"].strip()
+ for doc in corpus
+ ]
+ kwargs.update(prompt_type=PromptType.passage)
+ embeddings = self.encode(sentences, is_query=False, **kwargs)
+ return embeddings
+
+ def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs):
+ return self.get_fused_embeddings(images=images, **kwargs)
+
+ def get_text_embeddings(self, texts: list[str], **kwargs):
+ return self.get_fused_embeddings(texts=texts, **kwargs)
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] | None = None,
+ images: list[Image.Image] | DataLoader | None = None,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ tqdm_mininterval: int = 15,
+ instruction=None,
+ **kwargs: Any,
+ ):
+ if prompt_type == PromptType.passage:
+ instruction = None
+ elif instruction is None:
+ instruction = get_gme_instruction(task_name)
+ self.model = self.model.to(self.device)
+
+ if isinstance(images, DataLoader):
+ image_loader = images
+ batch_size = image_loader.batch_size
+ image_loader.dataset.transform = None
+ else:
+ batch_size = kwargs.pop("batch_size", 32)
+ if images is None:
+ image_loader = None
+ else:
+ image_loader = DataLoader(
+ images,
+ batch_size=batch_size,
+ shuffle=False,
+ collate_fn=custom_collate_fn,
+ num_workers=min(math.floor(os.cpu_count() / 2), 8),
+ )
+
+ if texts is None:
+ assert image_loader is not None
+ n_batch = len(image_loader)
+ else:
+ n_batch = len(texts) // batch_size + int(len(texts) % batch_size > 0)
+ image_loader = image_loader or [None] * n_batch
+
+ all_embeddings = []
+ none_batch = [None] * batch_size
+ show_progress_bar = kwargs.pop("show_progress_bar", True)
+ pbar = tqdm(
+ total=n_batch,
+ disable=not show_progress_bar,
+ mininterval=tqdm_mininterval,
+ miniters=n_batch // 10,
+ desc="encode",
+ )
+ for n, (i, img_batch) in enumerate(
+ zip(range(0, n_batch * batch_size, batch_size), image_loader)
+ ):
+ text_batch = none_batch if texts is None else texts[i : i + batch_size]
+ img_batch = none_batch if img_batch is None else img_batch
+ inputs = dict(
+ texts=text_batch, images=img_batch, instruction=instruction, **kwargs
+ )
+ with torch.inference_mode():
+ embeddings = self.model.embed(**inputs, device=self.device)
+ all_embeddings.append(embeddings.cpu())
+ pbar.update(1)
+ pbar.close()
+ all_embeddings = torch.cat(all_embeddings, dim=0)
+ return all_embeddings
+
+
+def custom_collate_fn(batch):
+ return batch
+
+
+### Copied from qwen_vl_utils.vision_process.py
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+
+def round_by_factor(number: int, factor: int) -> int:
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
+ return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+ return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+ return math.floor(number / factor) * factor
+
+
+def smart_resize(
+ height: int,
+ width: int,
+ factor: int = IMAGE_FACTOR,
+ min_pixels: int = MIN_PIXELS,
+ max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+ """Rescales the image so that the following conditions are met:
+
+ 1. Both dimensions (height and width) are divisible by 'factor'.
+
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+ 3. The aspect ratio of the image is maintained as closely as possible.
+ """
+ h_bar = max(factor, round_by_factor(height, factor))
+ w_bar = max(factor, round_by_factor(width, factor))
+ if h_bar * w_bar > max_pixels:
+ beta = math.sqrt((height * width) / max_pixels)
+ h_bar = floor_by_factor(height / beta, factor)
+ w_bar = floor_by_factor(width / beta, factor)
+ elif h_bar * w_bar < min_pixels:
+ beta = math.sqrt(min_pixels / (height * width))
+ h_bar = ceil_by_factor(height * beta, factor)
+ w_bar = ceil_by_factor(width * beta, factor)
+
+ if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
+ logger.warning(
+ f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
+ )
+ if h_bar > w_bar:
+ h_bar = w_bar * MAX_RATIO
+ else:
+ w_bar = h_bar * MAX_RATIO
+ return h_bar, w_bar
+
+
+def fetch_image(
+ image: str | Image.Image, size_factor: int = IMAGE_FACTOR
+) -> Image.Image:
+ image_obj = None
+ if isinstance(image, Image.Image):
+ image_obj = image
+ elif image.startswith("http://") or image.startswith("https://"):
+ import requests
+
+ image_obj = Image.open(requests.get(image, stream=True).raw)
+ elif image.startswith("file://"):
+ image_obj = Image.open(image[7:])
+ elif image.startswith("data:image"):
+ import base64
+ from io import BytesIO
+
+ if "base64," in image:
+ _, base64_data = image.split("base64,", 1)
+ data = base64.b64decode(base64_data)
+ image_obj = Image.open(BytesIO(data))
+ else:
+ image_obj = Image.open(image)
+ if image_obj is None:
+ raise ValueError(
+ f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
+ )
+ image = image_obj.convert("RGB")
+ ## resize
+ # if "resized_height" in ele and "resized_width" in ele:
+ # resized_height, resized_width = smart_resize(
+ # ele["resized_height"],
+ # ele["resized_width"],
+ # factor=size_factor,
+ # )
+ # else:
+ width, height = image.size
+ # min_pixels = ele.get("min_pixels", MIN_PIXELS)
+ # max_pixels = ele.get("max_pixels", MAX_PIXELS)
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=size_factor,
+ min_pixels=MIN_PIXELS,
+ max_pixels=MAX_PIXELS,
+ )
+ image = image.resize((resized_width, resized_height))
+
+ return image
+
+
+###
+training_data = {
+ "MSMARCO": ["train"],
+ "NQ": ["train"],
+ "NQHardNegatives": ["train"],
+ "NanoNQRetrieval": ["train"],
+ "NQ-PL": ["train"], # translation not trained on
+ "HotpotQA": ["train"],
+ "HotpotQA-PL": ["train"], # translation not trained on
+ "HotpotQAHardNegatives": ["train"],
+ # TriviaQA (Joshi et al., 2017),
+ # SQuAD (Rajpurkar et al., 2016),
+ "FEVER": ["train"],
+ # AllNLI for SimCSE (Gao et al., 2021), selecting a total of 1 million entries.
+ # ImageNet (Deng et al., 2009)
+ # LAION (Schuhmann et al., 2022),
+ # mscoco (Lin et al., 2014),
+    # Docmatix (Laurençon et al., 2024)
+ # synthetic data
+ # M-BEIR (Wei et al., 2024)
+}
+
+
+gme_qwen2vl_2b = ModelMeta(
+ loader=partial(
+ GmeQwen2VL,
+ model_name=HF_GME_QWEN2VL_2B,
+ ),
+ name=HF_GME_QWEN2VL_2B,
+ languages=["eng_Latn", "cmn-Hans"],
+ open_weights=True,
+ revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
+ release_date="2024-12-24",
+ modalities=["image", "text"],
+ n_parameters=2_210_000_000,
+ embed_dim=1536,
+ license="apache-2.0",
+ max_tokens=32768,
+ reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B,
+ similarity_fn_name="cosine",
+ framework=["PyTorch"],
+ use_instructions=True,
+ public_training_code=None,
+ public_training_data=None,
+ training_datasets=training_data,
+)
+
+gme_qwen2vl_7b = ModelMeta(
+ loader=partial(
+ GmeQwen2VL,
+ model_name=HF_GME_QWEN2VL_7B,
+ ),
+ name=HF_GME_QWEN2VL_7B,
+ languages=["eng_Latn", "cmn-Hans"],
+ open_weights=True,
+ revision="477027a6480f8630363be77751f169cc3434b673",
+ release_date="2024-12-24",
+ modalities=["image", "text"],
+ n_parameters=8_290_000_000,
+ embed_dim=3584,
+ license="apache-2.0",
+ max_tokens=32768,
+ reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B,
+ similarity_fn_name="cosine",
+ framework=["PyTorch"],
+ use_instructions=True,
+ public_training_code=None,
+ public_training_data=None,
+ training_datasets=training_data,
+)
diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
index 1e3a0b42bd..440779787b 100644
--- a/mteb/models/gritlm_models.py
+++ b/mteb/models/gritlm_models.py
@@ -54,6 +54,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
public_training_code="https://github.com/ContextualAI/gritlm",
public_training_data=None,
)
+
gritlm8x7b = ModelMeta(
loader=partial( # type: ignore
instruct_wrapper,
diff --git a/mteb/models/jina_clip.py b/mteb/models/jina_clip.py
new file mode 100644
index 0000000000..551c82c101
--- /dev/null
+++ b/mteb/models/jina_clip.py
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModel
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+class JinaCLIPModelWrapper:  # thin wrapper around HF jina-clip (AutoModel with trust_remote_code)
+    def __init__(
+        self,
+        model_name: str,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        self.device = device
+        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(
+            self.device
+        )
+
+    def get_text_embeddings(
+        self,
+        texts: list[str],
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        convert_to_numpy=False,
+        convert_to_tensor=True,
+        **kwargs: Any,
+    ):
+        all_text_embeddings = []  # per-batch outputs, concatenated at the end
+
+        with torch.no_grad():
+            for i in tqdm(range(0, len(texts), batch_size)):
+                batch_texts = texts[i : i + batch_size]
+                text_outputs = self.model.encode_text(
+                    batch_texts,
+                    convert_to_numpy=convert_to_numpy,
+                    convert_to_tensor=convert_to_tensor,
+                )
+                all_text_embeddings.append(text_outputs.cpu())
+
+        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+        return all_text_embeddings
+
+    def get_image_embeddings(
+        self,
+        images: list[Image.Image] | DataLoader,
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        convert_to_numpy=False,
+        convert_to_tensor=True,
+        **kwargs: Any,
+    ):
+        all_image_embeddings = []
+
+        if isinstance(images, DataLoader):  # DataLoader yields tensor batches; convert back to PIL for the model
+            with torch.no_grad():
+                import torchvision.transforms.functional as F
+
+                for batch in tqdm(images):
+                    image_outputs = self.model.encode_image(
+                        [F.to_pil_image(b.to("cpu")) for b in batch],
+                        convert_to_numpy=convert_to_numpy,
+                        convert_to_tensor=convert_to_tensor,
+                    )
+                    all_image_embeddings.append(image_outputs.cpu())
+        else:
+            with torch.no_grad():
+                for i in tqdm(range(0, len(images), batch_size)):
+                    batch_images = images[i : i + batch_size]
+                    image_outputs = self.model.encode_image(
+                        batch_images, convert_to_numpy=convert_to_numpy, convert_to_tensor=convert_to_tensor  # fix: was hardcoded False/True, ignoring the method's parameters (inconsistent with the DataLoader branch)
+                    )
+                    all_image_embeddings.append(image_outputs.cpu())
+
+        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+        return all_image_embeddings
+
+    def calculate_probs(self, text_embeddings, image_embeddings):  # CLIP-style image->text softmax over cosine logits
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+        image_embeddings = image_embeddings / image_embeddings.norm(
+            dim=-1, keepdim=True
+        )
+        logits = torch.matmul(image_embeddings, text_embeddings.T)
+        probs = (logits * 100).softmax(dim=-1)  # 100 acts as a fixed logit scale (temperature)
+        return probs
+
+    def get_fused_embeddings(
+        self,
+        texts: list[str] = None,
+        images: list[Image.Image] = None,
+        fusion_mode="sum",
+        **kwargs: Any,
+    ):
+        if texts is None and images is None:
+            raise ValueError("Either texts or images must be provided")
+
+        text_embeddings = None
+        image_embeddings = None
+
+        if texts is not None:
+            text_embeddings = self.get_text_embeddings(
+                texts, convert_to_numpy=False, convert_to_tensor=True, **kwargs
+            )
+
+        if images is not None:
+            image_embeddings = self.get_image_embeddings(
+                images, convert_to_numpy=False, convert_to_tensor=True, **kwargs
+            )
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            if fusion_mode == "sum":
+                fused_embeddings = text_embeddings + image_embeddings
+            else:
+                # to do: add other fusion mode
+                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+    def encode(  # type: ignore
+        self,
+        sentences: list[str],
+        *,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        if "task_name" in kwargs:  # the underlying encode_text does not accept task_name
+            kwargs.pop("task_name")
+        return self.model.encode_text(sentences, batch_size=batch_size, **kwargs)
+
+
+jina_clip_v1 = ModelMeta(  # metadata for jinaai/jina-clip-v1 (image+text)
+    loader=partial(
+        JinaCLIPModelWrapper,
+        model_name="jinaai/jina-clip-v1",
+    ),
+    name="jinaai/jina-clip-v1",
+    languages=["eng_Latn"],
+    revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4",  # pinned HF revision
+    release_date="2024-05-30",
+    modalities=["image", "text"],
+    n_parameters=223_000_000,
+    max_tokens=8192,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/jinaai/jina-clip-v1",
+    similarity_fn_name=None,
+    use_instructions=True,
+    training_datasets={
+        # LAION400M
+        # ShareGPT4V
+        "MSMARCO": ["train"],
+        # NQ
+        # HotpotQA
+        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+    },
+)
diff --git a/mteb/models/moco_models.py b/mteb/models/moco_models.py
new file mode 100644
index 0000000000..1c896331bc
--- /dev/null
+++ b/mteb/models/moco_models.py
@@ -0,0 +1,191 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+def mocov3_loader(**kwargs):
+    # Deferred timm import so mteb works without the optional dependency installed.
+    try:
+        import timm
+    except ImportError:
+        raise ImportError("Please install `pip install timm` to use MOCOv3 models.")
+
+    class MOCOv3Wrapper:
+        """A wrapper class for MOCOv3 models that supports image encoding.
+        Text encoding and text-image fusion are not supported.
+        """
+
+        def __init__(
+            self,
+            model_name: str = "nyu-visionx/moco-v3-vit-b",
+            device: str = "cuda" if torch.cuda.is_available() else "cpu",
+            **kwargs: Any,
+        ):
+            self.model_name = model_name
+            self.device = device
+            name = "vit_base_patch16_224"
+            if "vit-l" in model_name:  # select the large backbone from the HF repo name
+                name = "vit_large_patch16_224"
+            model = timm.create_model(
+                name,
+                pretrained=True,
+                num_classes=0,  # num_classes=0 drops the classifier head -> features out
+                pretrained_cfg_overlay={"hf_hub_id": model_name},
+            )
+
+            self.model = model.eval()
+
+            # get model specific transforms (normalization, resize)
+            data_config = timm.data.resolve_model_data_config(self.model)
+            self.processor = timm.data.create_transform(
+                **data_config, is_training=False
+            )
+
+        @staticmethod
+        def get_text_embeddings(
+            texts: list[str],
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            raise ValueError("MOCO models only support image encoding.")
+
+        def get_image_embeddings(
+            self,
+            images: list[Image.Image] | DataLoader,
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            all_image_embeddings = []
+
+            if isinstance(images, DataLoader):  # batches of tensors -> PIL for the timm transform
+                import torchvision.transforms.functional as F
+
+                with torch.no_grad():
+                    for batch in tqdm(images):
+                        inputs = torch.vstack(
+                            [
+                                self.processor(F.to_pil_image(b.to("cpu"))).unsqueeze(0)
+                                for b in batch
+                            ]
+                        )
+                        output = self.model(
+                            inputs
+                        )  # output is (batch_size, num_features) shaped tensor
+                        all_image_embeddings.append(output)
+            else:
+                with torch.no_grad():
+                    for i in tqdm(range(0, len(images), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        inputs = torch.vstack(
+                            [self.processor(b).unsqueeze(0) for b in batch_images]
+                        )
+                        output = self.model(
+                            inputs
+                        )  # output is (batch_size, num_features) shaped tensor
+                        all_image_embeddings.append(output)
+
+            all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+            return all_image_embeddings
+
+        @staticmethod
+        def calculate_probs(text_embeddings, image_embeddings):
+            raise ValueError("MOCO models only support image encoding.")
+
+        def get_fused_embeddings(
+            self,
+            texts: list[str] = None,
+            images: list[Image.Image] | DataLoader = None,
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            fusion_mode="sum",
+            **kwargs: Any,
+        ):
+            if texts is None and images is None:
+                raise ValueError("images must be provided for MOCO models")
+
+            text_embeddings = None
+            image_embeddings = None
+
+            if texts is not None:
+                text_embeddings = self.get_text_embeddings(texts, batch_size=batch_size)  # fix: batch_size is keyword-only; positional call raised TypeError instead of the intended ValueError
+
+            if images is not None:
+                image_embeddings = self.get_image_embeddings(images, batch_size=batch_size)  # fix: same keyword-only issue — positional call was a TypeError
+
+            if text_embeddings is not None and image_embeddings is not None:
+                raise ValueError("MOCO models only support image encoding.")
+            elif text_embeddings is not None:
+                return text_embeddings
+            elif image_embeddings is not None:
+                return image_embeddings
+
+    return MOCOv3Wrapper(**kwargs)
+
+
+mocov3_training_datasets = {
+    # imagenet
+}
+
+mocov3_vit_base = ModelMeta(  # MoCo v3 ViT-B/16, image-only encoder
+    loader=partial(
+        mocov3_loader,
+        model_name="nyu-visionx/moco-v3-vit-b",
+    ),
+    name="nyu-visionx/moco-v3-vit-b",
+    languages=["eng_Latn"],
+    revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d",  # pinned HF revision
+    release_date="2024-06-03",
+    modalities=["image"],  # no text encoder — see MOCOv3Wrapper
+    n_parameters=86_600_000,
+    max_tokens=None,
+    embed_dim=768,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code="https://github.com/facebookresearch/moco-v3",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://github.com/facebookresearch/moco-v3",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=mocov3_training_datasets,
+)
+
+mocov3_vit_large = ModelMeta(  # MoCo v3 ViT-L/16, image-only encoder
+    loader=partial(
+        mocov3_loader,
+        model_name="nyu-visionx/moco-v3-vit-l",
+    ),
+    name="nyu-visionx/moco-v3-vit-l",
+    languages=["eng_Latn"],
+    revision="7bf75358d616f39b9716148bf4e3425f3bd35b47",  # pinned HF revision
+    release_date="2024-06-03",
+    modalities=["image"],
+    n_parameters=304_000_000,
+    max_tokens=None,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code="https://github.com/facebookresearch/moco-v3",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://github.com/facebookresearch/moco-v3",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets=mocov3_training_datasets,
+)
diff --git a/mteb/models/nomic_models_vision.py b/mteb/models/nomic_models_vision.py
new file mode 100644
index 0000000000..4eb00316ae
--- /dev/null
+++ b/mteb/models/nomic_models_vision.py
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+class NomicVisionModelWrapper:  # pairs a nomic vision encoder with a nomic text encoder into one aligned embedding space
+    def __init__(
+        self,
+        vision_model_name: str,
+        text_model_name: str,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
+    ):
+        self.vision_model_name = vision_model_name
+        self.text_model_name = text_model_name
+        self.device = device
+        self.processor = AutoImageProcessor.from_pretrained(self.vision_model_name)
+        self.vision_model = AutoModel.from_pretrained(
+            self.vision_model_name, trust_remote_code=True
+        ).to(self.device)
+        self.text_model = AutoModel.from_pretrained(
+            self.text_model_name, trust_remote_code=True
+        ).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.text_model_name)
+
+        self.text_model.eval()
+        self.vision_model.eval()
+
+    def preprocess(
+        self,
+        texts: list[str],
+        images: list[Image.Image],
+    ):
+        return self.processor(
+            text=texts, images=images, return_tensors="pt", padding=True
+        )  # NOTE(review): self.processor is an AutoImageProcessor — confirm it accepts a `text=` kwarg; this helper is not used elsewhere in this class
+
+    def get_text_embeddings(
+        self,
+        texts: list[str],
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        all_text_embeddings = []  # per-batch CPU tensors, concatenated at the end
+
+        with torch.no_grad():
+            for i in tqdm(range(0, len(texts), batch_size)):
+                batch_texts = texts[i : i + batch_size]
+                inputs = self.tokenizer(
+                    batch_texts, padding=True, truncation=True, return_tensors="pt"
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                text_outputs = self.text_model(**inputs)
+                text_embeddings = self.mean_pooling(
+                    text_outputs, inputs["attention_mask"]
+                )
+                # layer-norm then L2-normalize, so text vectors are unit length
+                text_embeddings = F.layer_norm(
+                    text_embeddings, normalized_shape=(text_embeddings.shape[1],)
+                )
+                text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
+                all_text_embeddings.append(text_embeddings.cpu())
+
+        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+        return all_text_embeddings
+
+    def mean_pooling(self, model_output, attention_mask):
+        # Average token embeddings, masking out padding positions.
+        token_embeddings = model_output[0]
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        )
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+            input_mask_expanded.sum(1), min=1e-9  # clamp avoids divide-by-zero on all-pad rows
+        )
+
+    def get_image_embeddings(
+        self,
+        images: list[Image.Image] | DataLoader,
+        *,
+        task_name: str | None = None,
+        prompt_type: PromptType | None = None,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        all_image_embeddings = []
+
+        if isinstance(images, DataLoader):
+            with torch.no_grad():
+                for batch in tqdm(images):
+                    inputs = self.processor(images=batch, return_tensors="pt")
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.vision_model(**inputs).last_hidden_state
+                    # CLS token (position 0) L2-normalized as the image embedding
+                    img_embeddings = F.normalize(image_outputs[:, 0], p=2, dim=1)
+                    all_image_embeddings.append(img_embeddings.cpu())
+        else:
+            with torch.no_grad():
+                for i in tqdm(range(0, len(images), batch_size)):
+                    batch_images = images[i : i + batch_size]
+                    inputs = self.processor(images=batch_images, return_tensors="pt")
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.vision_model(**inputs).last_hidden_state
+                    img_embeddings = F.normalize(image_outputs[:, 0], p=2, dim=1)
+                    all_image_embeddings.append(img_embeddings.cpu())
+
+        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+        return all_image_embeddings
+
+    def calculate_probs(self, text_embeddings, image_embeddings):
+        # already normalized in the encoding functions
+        logits = torch.matmul(image_embeddings, text_embeddings.T)
+        probs = (logits * 100).softmax(dim=-1)  # 100 acts as a fixed logit scale
+        return probs
+
+    def get_fused_embeddings(
+        self,
+        texts: list[str] = None,
+        images: list[Image.Image] | DataLoader = None,
+        fusion_mode="sum",
+        **kwargs: Any,
+    ):
+        if texts is None and images is None:
+            raise ValueError("Either texts or images must be provided")
+
+        text_embeddings = None
+        image_embeddings = None
+
+        if texts is not None:
+            text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+        if images is not None:
+            image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            if fusion_mode == "sum":
+                fused_embeddings = text_embeddings + image_embeddings
+            else:
+                # to do: add other fusion mode
+                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+
+nomic_embed_vision_v1_5 = ModelMeta(  # vision encoder aligned with nomic-embed-text-v1.5
+    loader=partial(
+        NomicVisionModelWrapper,
+        vision_model_name="nomic-ai/nomic-embed-vision-v1.5",
+        text_model_name="nomic-ai/nomic-embed-text-v1.5",
+    ),
+    name="nomic-ai/nomic-embed-vision-v1.5",
+    languages=["eng_Latn"],
+    revision="af2246fffdab78d8458418480e4886a8e48b70a7",  # pinned HF revision
+    release_date="2024-06-08",
+    modalities=["image", "text"],
+    n_parameters=92_900_000,
+    max_tokens=2048,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/nomic-ai/contrastors",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5",
+    similarity_fn_name=None,
+    use_instructions=True,
+    training_datasets={
+        # https://arxiv.org/pdf/2406.18587
+        # DFN-2B
+    },
+)
diff --git a/mteb/models/openclip_models.py b/mteb/models/openclip_models.py
new file mode 100644
index 0000000000..26e89e6cb6
--- /dev/null
+++ b/mteb/models/openclip_models.py
@@ -0,0 +1,359 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
+def openclip_loader(**kwargs):
+    # Deferred import so mteb works without the optional open_clip dependency.
+    try:
+        import open_clip
+    except ImportError:
+        raise ImportError("Please run `pip install open_clip_torch`.")
+
+    class OpenCLIPWrapper:
+        def __init__(
+            self,
+            model_name: str = "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+            device: str = "cuda" if torch.cuda.is_available() else "cpu",
+            **kwargs: Any,
+        ):
+            self.model_name = model_name
+            self.device = device
+            self.model, _, self.img_preprocess = open_clip.create_model_and_transforms(
+                f"hf-hub:{model_name}", device=device
+            )
+            self.model.eval()
+            self.tokenizer = open_clip.get_tokenizer(f"hf-hub:{model_name}")
+
+        def encode(  # type: ignore
+            self,
+            sentences: list[str],
+            *,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            return self.get_text_embeddings(texts=sentences, batch_size=batch_size)
+
+        def get_text_embeddings(
+            self,
+            texts: list[str],
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,  # fix: get_fused_embeddings forwards **kwargs here; without this any extra kwarg raised TypeError (get_image_embeddings already accepts **kwargs)
+        ):
+            all_text_embeddings = []
+
+            with torch.no_grad(), torch.cuda.amp.autocast():
+                for i in tqdm(range(0, len(texts), batch_size)):
+                    batch_texts = texts[i : i + batch_size]
+                    inputs = self.tokenizer(batch_texts)
+                    text_outputs = self.model.encode_text(inputs.to(self.device))
+                    all_text_embeddings.append(text_outputs.cpu())
+
+            all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+            return all_text_embeddings
+
+        def get_image_embeddings(
+            self,
+            images: list[Image.Image] | DataLoader,
+            *,
+            task_name: str | None = None,
+            prompt_type: PromptType | None = None,
+            batch_size: int = 32,
+            **kwargs: Any,
+        ):
+            all_image_embeddings = []
+            if isinstance(images, DataLoader):
+                import torchvision.transforms.functional as F
+
+                with torch.no_grad(), torch.cuda.amp.autocast():
+                    for batch in tqdm(images):
+                        # DataLoader yields tensor batches; convert to PIL for the preprocess transform
+                        inputs = torch.vstack(
+                            [
+                                self.img_preprocess(F.to_pil_image(b)).unsqueeze(0)
+                                for b in batch
+                            ]
+                        )
+                        image_outputs = self.model.encode_image(inputs.to(self.device))
+                        all_image_embeddings.append(image_outputs.cpu())
+            else:
+                with torch.no_grad(), torch.cuda.amp.autocast():
+                    for i in tqdm(range(0, len(images), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        inputs = torch.vstack(
+                            [self.img_preprocess(b).unsqueeze(0) for b in batch_images]
+                        )
+                        image_outputs = self.model.encode_image(inputs.to(self.device))
+                        all_image_embeddings.append(image_outputs.cpu())
+
+            all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+            return all_image_embeddings
+
+        def calculate_probs(self, text_embeddings, image_embeddings):
+            text_embeddings = text_embeddings / text_embeddings.norm(
+                dim=-1, keepdim=True
+            )
+            image_embeddings = image_embeddings / image_embeddings.norm(
+                dim=-1, keepdim=True
+            )
+            logits = torch.matmul(image_embeddings, text_embeddings.T)
+            probs = (logits * 100).softmax(dim=-1)
+            return probs
+
+        def get_fused_embeddings(
+            self,
+            texts: list[str] = None,
+            images: list[Image.Image] | DataLoader = None,
+            fusion_mode="sum",
+            **kwargs: Any,
+        ):
+            if texts is None and images is None:
+                raise ValueError("Either texts or images must be provided")
+
+            text_embeddings = None
+            image_embeddings = None
+
+            if texts is not None:
+                text_embeddings = self.get_text_embeddings(texts, **kwargs)
+
+            if images is not None:
+                image_embeddings = self.get_image_embeddings(images, **kwargs)
+
+            if text_embeddings is not None and image_embeddings is not None:
+                if len(text_embeddings) != len(image_embeddings):
+                    raise ValueError(
+                        "The number of texts and images must have the same length"
+                    )
+                if fusion_mode == "sum":
+                    fused_embeddings = text_embeddings + image_embeddings
+                else:
+                    # to do: add other fusion mode
+                    raise ValueError(
+                        f"fusion mode {fusion_mode} hasn't been implemented"
+                    )
+                return fused_embeddings
+            elif text_embeddings is not None:
+                return text_embeddings
+            elif image_embeddings is not None:
+                return image_embeddings
+
+    return OpenCLIPWrapper(**kwargs)
+
+
+CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(  # OpenCLIP ViT-L/14 trained on DataComp-1B
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+    ),
+    name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+    languages=["eng_Latn"],
+    revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341",  # pinned HF revision
+    release_date="2023-04-26",
+    modalities=["image", "text"],
+    n_parameters=428_000_000,
+    max_tokens=77,  # CLIP text-context length
+    embed_dim=768,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://huggingface.co/datasets/mlfoundations/datacomp_1b",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # DataComp-1B
+    },
+)
+
+CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(  # OpenCLIP ViT-B/32 trained on DataComp-1B
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+    ),
+    name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+    languages=["eng_Latn"],
+    revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912",  # pinned HF revision
+    release_date="2023-04-26",
+    modalities=["image", "text"],
+    n_parameters=151_000_000,
+    max_tokens=77,
+    embed_dim=512,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://huggingface.co/datasets/mlfoundations/datacomp_1b",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # DataComp-1B
+    },
+)
+
+CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(  # OpenCLIP ViT-B/16 trained on DataComp-1B
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+    ),
+    name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+    languages=["eng_Latn"],
+    revision="d110532e8d4ff91c574ee60a342323f28468b287",  # pinned HF revision
+    release_date="2023-04-26",
+    modalities=["image", "text"],
+    n_parameters=150_000_000,
+    max_tokens=77,
+    embed_dim=512,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://huggingface.co/datasets/mlfoundations/datacomp_1b",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # DataComp-1B
+    },
+)
+
+CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(  # OpenCLIP ViT-bigG/14 trained on LAION-2B (en)
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+    ),
+    name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+    languages=["eng_Latn"],
+    revision="bc7788f151930d91b58474715fdce5524ad9a189",  # pinned HF revision
+    release_date="2023-01-23",
+    modalities=["image", "text"],
+    n_parameters=2_540_000_000,
+    max_tokens=77,
+    embed_dim=1280,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://laion.ai/blog/laion-5b/",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # 2 Billion sample English subset of LAION-5B
+    },
+)
+
+CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(  # OpenCLIP ViT-g/14 trained on LAION-2B (en)
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+    ),
+    name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+    languages=["eng_Latn"],
+    revision="15efd0f6ac0c40c0f9da7becca03c974d7012604",  # pinned HF revision
+    release_date="2023-03-06",
+    modalities=["image", "text"],
+    n_parameters=1_367_000_000,
+    max_tokens=77,
+    embed_dim=1024,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://laion.ai/blog/laion-5b/",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # 2 Billion sample English subset of LAION-5B
+    },
+)
+
+CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(  # OpenCLIP ViT-H/14 trained on LAION-2B (en)
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+    ),
+    name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+    languages=["eng_Latn"],
+    revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b",  # pinned HF revision
+    release_date="2022-09-15",
+    modalities=["image", "text"],
+    n_parameters=986_000_000,
+    max_tokens=77,
+    embed_dim=1024,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://laion.ai/blog/laion-5b/",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # 2 Billion sample English subset of LAION-5B
+    },
+)
+
+CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(  # OpenCLIP ViT-L/14 trained on LAION-2B (en)
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+    ),
+    name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+    languages=["eng_Latn"],
+    revision="1627032197142fbe2a7cfec626f4ced3ae60d07a",  # pinned HF revision
+    release_date="2022-09-15",
+    modalities=["image", "text"],
+    n_parameters=428_000_000,
+    max_tokens=77,
+    embed_dim=768,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://laion.ai/blog/laion-5b/",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # 2 Billion sample English subset of LAION-5B
+    },
+)
+
+CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(  # OpenCLIP ViT-B/32 trained on LAION-2B (en)
+    loader=partial(
+        openclip_loader,
+        model_name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+    ),
+    name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+    languages=["eng_Latn"],
+    revision="08f73555f1b2fb7c82058aebbd492887a94968ef",  # pinned HF revision
+    release_date="2022-09-15",
+    modalities=["image", "text"],
+    n_parameters=151_000_000,
+    max_tokens=77,
+    embed_dim=512,
+    license="mit",
+    open_weights=True,
+    public_training_code="https://github.com/mlfoundations/open_clip",
+    public_training_data="https://laion.ai/blog/laion-5b/",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+    similarity_fn_name=None,
+    use_instructions=False,
+    training_datasets={
+        # 2 Billion sample English subset of LAION-5B
+    },
+)
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 399d8359c5..9137da2a79 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -12,15 +12,23 @@
from mteb.encoder_interface import Encoder
from mteb.model_meta import ModelMeta
from mteb.models import (
+ align_models,
arctic_models,
bedrock_models,
bge_models,
+ blip2_models,
+ blip_models,
bm25,
cde_models,
+ clip_models,
cohere_models,
+ cohere_v,
colbert_models,
+ dino_models,
e5_instruct,
e5_models,
+ e5_v,
+ evaclip_models,
gme_models,
google_models,
gritlm_models,
@@ -28,18 +36,22 @@
ibm_granite_models,
inf_models,
jasper_models,
+ jina_clip,
jina_models,
lens_models,
linq_models,
llm2vec_models,
misc_models,
+ moco_models,
model2vec_models,
moka_models,
mxbai_models,
no_instruct_sentence_models,
nomic_models,
+ nomic_models_vision,
nvidia_models,
openai_models,
+ openclip_models,
piccolo_models,
promptriever_models,
repllama_models,
@@ -48,42 +60,59 @@
     ru_sentence_models,
     salesforce_models,
     sentence_transformers_models,
+    siglip_models,
     stella_models,
     text2vec_models,
    uae_models,
+    vista_models,
+    vlm2vec_models,
     voyage_models,
+    voyage_v,
 )
 
 logger = logging.getLogger(__name__)
 
 model_modules = [
+    align_models,
     arctic_models,
+    bedrock_models,
     bge_models,
+    blip2_models,
+    blip_models,
     bm25,
+    clip_models,
     cde_models,
     cohere_models,
+    cohere_v,
     colbert_models,
+    dino_models,
     e5_instruct,
     e5_models,
-    google_models,
+    e5_v,
+    evaclip_models,
+    gme_models,
     google_models,
     gritlm_models,
     gte_models,
-    gme_models,
     ibm_granite_models,
     inf_models,
+    jasper_models,
     jina_models,
+    jina_clip,
     lens_models,
     linq_models,
     llm2vec_models,
-    mxbai_models,
+    misc_models,
     model2vec_models,
     moka_models,
-    misc_models,
-    nomic_models,
+    moco_models,
+    mxbai_models,
     no_instruct_sentence_models,
+    nomic_models,
+    nomic_models_vision,
     nvidia_models,
     openai_models,
+    openclip_models,
     piccolo_models,
    promptriever_models,
     repllama_models,
@@ -92,16 +121,12 @@
     ru_sentence_models,
     salesforce_models,
     sentence_transformers_models,
-    voyage_models,
-    google_models,
-    repllama_models,
-    promptriever_models,
-    jina_models,
-    jasper_models,
-    uae_models,
-    text2vec_models,
+    siglip_models,
+    vista_models,
+    vlm2vec_models,
+    voyage_v,
     stella_models,
-    bedrock_models,
+    text2vec_models,
     uae_models,
     voyage_models,
 ]
diff --git a/mteb/models/siglip_models.py b/mteb/models/siglip_models.py
new file mode 100644
index 0000000000..b7543afc68
--- /dev/null
+++ b/mteb/models/siglip_models.py
@@ -0,0 +1,400 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+
class SiglipModelWrapper:
    """SigLIP text/image encoder wrapped for the MIEB encoder interface.

    Loads a Hugging Face ``AutoModel``/``AutoProcessor`` pair; embeddings
    come from ``get_text_features`` / ``get_image_features`` and are
    returned as CPU tensors.
    """

    def __init__(
        self,
        model_name: str,
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        **kwargs: Any,
    ):
        self.model_name = model_name
        self.device = device
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_name)

    def preprocess(
        self,
        texts: list[str],
        images: list[Image.Image],
    ):
        """Run the processor jointly over paired texts and images."""
        return self.processor(
            text=texts, images=images, return_tensors="pt", padding=True
        )

    def get_text_embeddings(
        self,
        texts: list[str],
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode texts in mini-batches; returns a ``(len(texts), dim)`` CPU tensor."""
        chunks = []
        with torch.no_grad():
            for start in tqdm(range(0, len(texts), batch_size)):
                # SigLIP tokenization uses fixed-length padding.
                encoded = self.processor(
                    text=texts[start : start + batch_size],
                    return_tensors="pt",
                    padding="max_length",
                    truncation=True,
                )
                encoded = {key: val.to(self.device) for key, val in encoded.items()}
                chunks.append(self.model.get_text_features(**encoded).cpu())
        return torch.cat(chunks, dim=0)

    def get_image_embeddings(
        self,
        images: list[Image.Image] | DataLoader,
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode images given either a list of PIL images or a DataLoader of batches."""

        def encode_batch(batch):
            encoded = self.processor(images=batch, return_tensors="pt", padding=True)
            encoded = {key: val.to(self.device) for key, val in encoded.items()}
            return self.model.get_image_features(**encoded).cpu()

        chunks = []
        with torch.no_grad():
            if isinstance(images, DataLoader):
                for batch in tqdm(images):
                    chunks.append(encode_batch(batch))
            else:
                for start in tqdm(range(0, len(images), batch_size)):
                    # PIL images must be RGB before the processor sees them.
                    batch = [
                        img.convert("RGB")
                        if isinstance(img, Image.Image) and img.mode != "RGB"
                        else img
                        for img in images[start : start + batch_size]
                    ]
                    chunks.append(encode_batch(batch))
        return torch.cat(chunks, dim=0)

    def calculate_probs(self, text_embeddings, image_embeddings):
        """Return image-to-text SigLIP logits (scaled, biased cosine similarity).

        NOTE: despite the name, the returned values are unnormalized logits,
        not probabilities — no softmax/sigmoid is applied here.
        """
        # Normalize both sides so the matmul below is cosine similarity.
        image_embeddings = image_embeddings / image_embeddings.norm(
            p=2, dim=-1, keepdim=True
        )
        text_embeddings = text_embeddings / text_embeddings.norm(
            p=2, dim=-1, keepdim=True
        )
        scale = self.model.logit_scale.exp().to(text_embeddings.device)
        bias = self.model.logit_bias.to(text_embeddings.device)
        similarity = torch.matmul(
            text_embeddings, image_embeddings.t().to(text_embeddings.device)
        )
        logits_per_text = similarity * scale + bias
        return logits_per_text.t()

    def get_fused_embeddings(
        self,
        texts: list[str] = None,
        images: list[Image.Image] | DataLoader = None,
        fusion_mode="sum",
        **kwargs: Any,
    ):
        """Encode texts and/or images; when both are given, fuse by summation."""
        if texts is None and images is None:
            raise ValueError("Either texts or images must be provided")

        text_embeddings = (
            self.get_text_embeddings(texts, **kwargs) if texts is not None else None
        )
        image_embeddings = (
            self.get_image_embeddings(images, **kwargs) if images is not None else None
        )

        if text_embeddings is None:
            return image_embeddings
        if image_embeddings is None:
            return text_embeddings
        if len(text_embeddings) != len(image_embeddings):
            raise ValueError("The number of texts and images must have the same length")
        if fusion_mode != "sum":
            # Only additive fusion is implemented so far.
            raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
        return text_embeddings + image_embeddings
+
+
# Shared `training_datasets` metadata for every SigLIP checkpoint below.
# Deliberately empty: per the reference, SigLIP was trained on WebLI, which
# is not an MTEB dataset, so there is no task overlap to record.
siglip_training_datasets = {
    # WebLI https://arxiv.org/abs/2209.06794
}
+
+siglip_so400m_patch14_224 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-so400m-patch14-224",
+ ),
+ name="google/siglip-so400m-patch14-224",
+ languages=["eng_Latn"],
+ revision="d04cf29fca7b6374f74d8bea1969314492266b5e",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=877_000_000,
+ max_tokens=16,
+ embed_dim=1152,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-so400m-patch14-224",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_so400m_patch14_384 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-so400m-patch14-384",
+ ),
+ name="google/siglip-so400m-patch14-384",
+ languages=["eng_Latn"],
+ revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=878_000_000,
+ max_tokens=64,
+ embed_dim=1152,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-so400m-patch14-384",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_so400m_patch16_256_i18n = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-so400m-patch16-256-i18n",
+ ),
+ name="google/siglip-so400m-patch16-256-i18n",
+ languages=["eng_Latn"],
+ revision="365d321c0cfdea96bc28e3a29787a11a062681a1",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=1_130_000_000,
+ max_tokens=64,
+ embed_dim=1152,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-so400m-patch16-256-i18n",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_base_patch16_256_multilingual = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-base-patch16-256-multilingual",
+ ),
+ name="google/siglip-base-patch16-256-multilingual",
+ languages=["eng_Latn"],
+ revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=371_000_000,
+ max_tokens=64,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-base-patch16-256-multilingual",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_base_patch16_256 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-base-patch16-256",
+ ),
+ name="google/siglip-base-patch16-256",
+ languages=["eng_Latn"],
+ revision="b078df89e446d623010d890864d4207fe6399f61",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=203_000_000,
+ max_tokens=64,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-base-patch16-256",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_base_patch16_512 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-base-patch16-512",
+ ),
+ name="google/siglip-base-patch16-512",
+ languages=["eng_Latn"],
+ revision="753a949581523b60257d93e18391e8c27f72eb22",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=204_000_000,
+ max_tokens=64,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-base-patch16-512",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_base_patch16_384 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-base-patch16-384",
+ ),
+ name="google/siglip-base-patch16-384",
+ languages=["eng_Latn"],
+ revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=203_000_000,
+ max_tokens=64,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-base-patch16-384",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_base_patch16_224 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-base-patch16-224",
+ ),
+ name="google/siglip-base-patch16-224",
+ languages=["eng_Latn"],
+ revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=203_000_000,
+ max_tokens=64,
+ embed_dim=768,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-base-patch16-224",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_large_patch16_256 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-large-patch16-256",
+ ),
+ name="google/siglip-large-patch16-256",
+ languages=["eng_Latn"],
+ revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=652_000_000,
+ max_tokens=64,
+ embed_dim=1024,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-large-patch16-256",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
+
+siglip_large_patch16_384 = ModelMeta(
+ loader=partial(
+ SiglipModelWrapper,
+ model_name="google/siglip-large-patch16-384",
+ ),
+ name="google/siglip-large-patch16-384",
+ languages=["eng_Latn"],
+ revision="ce005573a40965dfd21fd937fbdeeebf2439fc35",
+ release_date="2024-01-08",
+ modalities=["image", "text"],
+ n_parameters=652_000_000,
+ max_tokens=64,
+ embed_dim=1024,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
+ public_training_data=None,
+ framework=["PyTorch"],
+ reference="https://huggingface.co/google/siglip-large-patch16-384",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=siglip_training_datasets,
+)
diff --git a/mteb/models/vista_models.py b/mteb/models/vista_models.py
new file mode 100644
index 0000000000..1344ec87cd
--- /dev/null
+++ b/mteb/models/vista_models.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+tensor_to_image = transforms.Compose([transforms.ToPILImage()])
+
+
def vista_loader(**kwargs):
    """Build a Visualized-BGE ("VISTA") wrapper instance.

    The `visual_bge` import is deferred so that mteb can be imported without
    the optional dependency installed; all kwargs are forwarded to the
    wrapper constructor below.
    """
    try:  # a temporal fix for the dependency issues of vista models.
        from visual_bge.modeling import Visualized_BGE
    except ImportError:
        raise ImportError(
            "Please install `visual_bge`, refer to https://github.com/FlagOpen/FlagEmbedding/tree/master/research/visual_bge#install-flagembedding."
        )

    class VisualizedBGEWrapper(Visualized_BGE):
        """Visualized-BGE adapted to the MIEB encoder interface.

        Inherits tokenizer/vision-preprocessor/encoders from
        `Visualized_BGE`; only the MIEB-facing batching methods and a
        dtype fix in `encode_text` are added here.

        NOTE: the misspelled `normlized` keyword mirrors the upstream
        `Visualized_BGE` API and must not be "fixed" here.
        """

        def __init__(
            self,
            model_name_bge: str = None,
            model_weight=None,
            normlized: bool = True,
            sentence_pooling_method: str = "cls",
            negatives_cross_device: bool = False,
            temperature: float = 0.02,
            from_pretrained=None,
            image_tokens_num: int = None,
            **kwargs: Any,
        ):
            super().__init__(
                model_name_bge=model_name_bge,
                model_weight=model_weight,
                normlized=normlized,
                sentence_pooling_method=sentence_pooling_method,
                negatives_cross_device=negatives_cross_device,
                temperature=temperature,
                from_pretrained=from_pretrained,
            )
            self.image_tokens_num = image_tokens_num
            # Reserve room for the image tokens inside the text budget, so a
            # text+image pair never exceeds the tokenizer's max length.
            self.max_text_len_with_image = (
                self.tokenizer.model_max_length - image_tokens_num
            )
            self.eval()

        def encode_text(self, texts):
            """Override of Visualized_BGE's original implementation that fixes
            an attention_mask / embedding_output dtype misalignment.

            `texts` is a tokenizer output dict with "input_ids" and
            "attention_mask"; returns pooled (optionally normalized) text
            representations.
            """
            input_ids = texts["input_ids"]
            attention_mask = texts["attention_mask"]

            input_shape = input_ids.size()
            device = input_ids.device

            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

            head_mask = [None] * self.depth
            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
                attention_mask, input_shape
            )

            embedding_output = self.bge_embeddings(
                input_ids=input_ids,
                position_ids=None,
                token_type_ids=token_type_ids,
                inputs_embeds=None,
                past_key_values_length=0,
            )

            # this line is missing in vista, currently override "encode_text" only to fix this.
            extended_attention_mask = extended_attention_mask.to(embedding_output.dtype)

            encoder_outputs = self.bge_encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                past_key_values=None,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=True,
            )
            sequence_output = encoder_outputs[0]

            t_reps = self.sentence_embedding(
                sequence_output, texts["attention_mask"]
            )  # tensor: reps with pooling
            if self.normlized:
                t_reps = torch.nn.functional.normalize(t_reps, dim=-1)
            return t_reps.contiguous()

        def encode(
            self,
            images=None,
            texts=None,
            tensors=False,
            task_name: str | None = None,
            prompt_type: PromptType | None = None,
            **kwargs: Any,
        ):
            """Encode one batch of images, texts, or image+text pairs.

            `images` may be PIL images, file paths, or (when `tensors=True`)
            image tensors that are converted back to PIL before
            `preprocess_val`. Returns None when neither input is given.
            """
            if images is not None:
                if isinstance(images, list):
                    if not tensors:
                        # Accept either already-opened PIL images or paths.
                        images = [
                            self.preprocess_val(
                                img if isinstance(img, Image.Image) else Image.open(img)
                            )
                            for img in images
                        ]
                    else:
                        images = [
                            self.preprocess_val(tensor_to_image(image))
                            for image in images
                        ]
                    images = torch.stack(images)
                if texts is not None:
                    # Truncate text so that text + image tokens fit the model.
                    texts = self.tokenizer(
                        texts,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=self.max_text_len_with_image,
                    )
                    return self.encode_mm(images.to(self.device), texts.to(self.device))
                else:
                    return self.encode_image(images.to(self.device))
            else:
                if texts is not None:
                    texts = self.tokenizer(
                        texts, return_tensors="pt", padding=True, truncation=True
                    )
                    return self.encode_text(texts.to(self.device))
                else:
                    return None

        def get_text_embeddings(
            self,
            texts: list[str],
            *,
            task_name: str | None = None,
            prompt_type: PromptType | None = None,
            batch_size: int = 32,
            **kwargs: Any,
        ):
            """Batch-encode texts; returns a single concatenated CPU tensor."""
            all_text_embeddings = []
            for i in tqdm(range(0, len(texts), batch_size)):
                batch_texts = texts[i : i + batch_size]
                with torch.no_grad():
                    batch_embeddings = self.encode(texts=batch_texts)
                all_text_embeddings.append(batch_embeddings.cpu())
            return torch.cat(all_text_embeddings, dim=0)

        def get_image_embeddings(
            self,
            images: list[Image.Image] | DataLoader,
            *,
            task_name: str | None = None,
            prompt_type: PromptType | None = None,
            batch_size: int = 32,
            **kwargs: Any,
        ):
            """Batch-encode images from a list or a DataLoader of tensor batches."""
            all_image_embeddings = []

            if isinstance(images, DataLoader):
                with torch.no_grad():
                    for batch in tqdm(images):
                        # DataLoader yields tensors, hence tensors=True.
                        batch_embeddings = self.encode(images=batch, tensors=True)
                        all_image_embeddings.append(batch_embeddings.cpu())
            else:
                with torch.no_grad():
                    for i in tqdm(range(0, len(images), batch_size)):
                        batch_images = images[i : i + batch_size]
                        batch_embeddings = self.encode(images=batch_images)
                        all_image_embeddings.append(batch_embeddings.cpu())
            return torch.cat(all_image_embeddings, dim=0)

        def get_fused_embeddings(
            self,
            texts: list[str] = None,
            images: list[Image.Image] | DataLoader = None,
            task_name: str | None = None,
            prompt_type: PromptType | None = None,
            batch_size: int = 32,
            **kwargs: Any,
        ):
            """Jointly encode aligned (text, image) pairs via `encode_mm`.

            NOTE(review): the DataLoader branch pairs texts with images by
            slicing `texts` with `batch_size`; this assumes the DataLoader's
            own batch size equals the `batch_size` argument — TODO confirm
            at the call sites.
            """
            all_embeddings = []

            if isinstance(images, DataLoader):
                with torch.no_grad():
                    for index, batch_images in enumerate(tqdm(images)):
                        batch_texts = texts[
                            index * batch_size : (index + 1) * batch_size
                        ]
                        batch_embeddings = self.encode(
                            images=batch_images, texts=batch_texts, tensors=True
                        )
                        all_embeddings.append(batch_embeddings.cpu())
            else:
                assert len(texts) == len(images)
                with torch.no_grad():
                    for i in tqdm(range(0, len(texts), batch_size)):
                        batch_texts = texts[i : i + batch_size]
                        batch_images = images[i : i + batch_size]
                        batch_embeddings = self.encode(
                            images=batch_images, texts=batch_texts
                        )
                        all_embeddings.append(batch_embeddings.cpu())
            return torch.cat(all_embeddings, dim=0)

        def calculate_probs(self, text_embeddings, image_embeddings):
            """Softmax over (scaled) cosine similarities, image rows vs text columns."""
            text_embeddings = text_embeddings / text_embeddings.norm(
                dim=-1, keepdim=True
            )
            image_embeddings = image_embeddings / image_embeddings.norm(
                dim=-1, keepdim=True
            )
            logits = torch.matmul(image_embeddings, text_embeddings.T)
            probs = (logits * 100).softmax(dim=-1)
            return probs

    return VisualizedBGEWrapper(**kwargs)
+
+
+vista_training_datasets = {
+ # VISTA_S2
+}
+
+visualized_bge_base = ModelMeta(
+ loader=partial(
+ vista_loader,
+ model_name_bge="BAAI/bge-base-en-v1.5",
+ model_weight="visualized_base_en_V1.5.pth",
+ image_tokens_num=196,
+ ),
+ name="BAAI/bge-visualized-base",
+ languages=["eng_Latn"],
+ revision="98db10b10d22620010d06f11733346e1c98c34aa",
+ release_date="2024-06-06",
+ modalities=["image", "text"],
+ n_parameters=196_000_000,
+ max_tokens=77,
+ embed_dim=768,
+ license=None,
+ open_weights=True,
+ public_training_code=None,
+ public_training_data="https://huggingface.co/datasets/JUNJIE99/VISTA_S2",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/BAAI/bge-visualized",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=vista_training_datasets,
+)
+
+visualized_bge_m3 = ModelMeta(
+ loader=partial(
+ vista_loader,
+ model_name_bge="BAAI/bge-m3",
+ model_weight="visualized_m3.pth",
+ image_tokens_num=256,
+ ),
+ name="BAAI/bge-visualized-m3",
+ languages=["eng_Latn"],
+ revision="98db10b10d22620010d06f11733346e1c98c34aa",
+ release_date="2024-06-06",
+ modalities=["image", "text"],
+ n_parameters=None,
+ max_tokens=77,
+ embed_dim=1024,
+ license=None,
+ open_weights=True,
+ public_training_code=None,
+ public_training_data="https://huggingface.co/datasets/JUNJIE99/VISTA_S2",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/BAAI/bge-visualized",
+ similarity_fn_name=None,
+ use_instructions=False,
+ training_datasets=vista_training_datasets,
+)
diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py
new file mode 100644
index 0000000000..7ca458c6a0
--- /dev/null
+++ b/mteb/models/vlm2vec_models.py
@@ -0,0 +1,415 @@
+from __future__ import annotations
+
+import logging
+from functools import partial
+from typing import Any, Literal
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+logging.basicConfig(level=logging.WARNING)
+logger = logging.getLogger(__name__)
+
+EncodeTypes = Literal["query", "passage"]
+
+
class VLM2VecWrapper:
    """Adapted from https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/model.py

    Wraps a (optionally LoRA-merged) Phi-3.5-vision backbone for MIEB.
    Embeddings are last-token pooled and L2-normalized; the per-example
    processor + padding logic shared by all public methods lives in
    `_collate`.
    """

    def __init__(
        self,
        model_name: str = "TIGER-Lab/VLM2Vec-LoRA",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        **kwargs,
    ):
        try:
            import flash_attn  # noqa
            from peft import LoraConfig, PeftModel  # noqa
        except ImportError:
            # NOTE(review): if `peft` (rather than flash_attn) is what's
            # missing, the LoRA branch below will fail with a NameError;
            # consider a dedicated error message for that case.
            logger.warning(
                "VLM2Vec models were trained with flash attention enabled. For optimal performance, please install the `flash_attn` package with `pip install flash-attn --no-build-isolation`."
            )

        self.pooling = "last"
        self.normalize = True
        self.temperature = 1.0
        self.hidden_size = 4096
        self.device = device

        # Loading the base model
        base_model_name = "microsoft/Phi-3.5-vision-instruct"
        config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
        config.use_cache = False
        config.padding_side = "right"

        checkpoint_path = model_name if model_name else base_model_name
        base_model = AutoModelForCausalLM.from_pretrained(
            checkpoint_path,
            config=config,
            attn_implementation="flash_attention_2",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        base_model.padding_side = "right"

        # Building the model on top of the base
        if "LoRA" in model_name:
            lora_config = LoraConfig.from_pretrained(checkpoint_path)
            lora_model = PeftModel.from_pretrained(
                base_model, checkpoint_path, config=lora_config
            )
            merged_model = lora_model.merge_and_unload()
            model = merged_model.to(torch.bfloat16)  # propagate dtype.
        else:
            model = base_model.to(torch.bfloat16)

        model.eval()
        model.to(device)
        self.mdl = model

        self.processor = AutoProcessor.from_pretrained(
            base_model_name,
            trust_remote_code=True,
            num_crops=4,
        )

    def encode(
        self,
        sentences: list[str],
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ):
        """Text-only entry point required by the MTEB encoder interface."""
        return self.get_text_embeddings(texts=sentences)

    def encode_input(self, input):
        """Forward pass returning pooled, normalized last-layer hidden states."""
        hidden_states = self.mdl(**input, return_dict=True, output_hidden_states=True)
        hidden_states = hidden_states.hidden_states[-1]
        pooled_output = self._pooling(hidden_states, input["attention_mask"])
        return pooled_output

    def _pooling(self, last_hidden_state, attention_mask):
        """Last-token pooling followed by optional L2 normalization."""
        if self.pooling == "last":
            # Index of the final non-padding token for each sequence.
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_state.shape[0]
            reps = last_hidden_state[
                torch.arange(batch_size, device=last_hidden_state.device),
                sequence_lengths,
            ]
        else:
            raise NotImplementedError
        if self.normalize:
            reps = torch.nn.functional.normalize(reps, p=2, dim=-1)
        return reps

    # reference: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/collator.py
    def _collate(self, prompts, pil_images):
        """Run the processor per example and pad into one model-ready batch.

        Args:
            prompts: one prompt string per example.
            pil_images: matching list of PIL images, or None for text-only.

        Returns:
            dict with "input_ids"/"attention_mask" (and image tensors when
            images are present), on `self.device`.
        """
        input_ids, pixel_values, image_sizes = [], [], []
        for idx, prompt in enumerate(prompts):
            inputs = self.processor(
                prompt,
                [pil_images[idx]] if pil_images is not None else None,
                return_tensors="pt",
                max_length=256,
                truncation=True,
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            input_ids.append(inputs["input_ids"].squeeze(0).unsqueeze(1))
            if pil_images is not None:
                pixel_values.append(inputs["pixel_values"])
                image_sizes.append(inputs["image_sizes"])

        # Public equivalent of the private torch._C._nn.pad_sequence used
        # in the original implementation.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id,
        ).squeeze(2)
        batch = {
            "input_ids": input_ids,
            "attention_mask": input_ids.ne(self.processor.tokenizer.pad_token_id),
        }
        if pil_images is not None:
            batch["pixel_values"] = torch.cat(pixel_values, dim=0)
            batch["image_sizes"] = torch.cat(image_sizes, dim=0)
        return batch

    def get_image_embeddings(
        self,
        images: list[Image.Image] | DataLoader,
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode images (list of PIL images or DataLoader of tensor batches)."""
        text = "<|image_1|> Represent the given image."
        all_image_embeddings = []
        if isinstance(images, DataLoader):
            import torchvision.transforms.functional as F

            with torch.no_grad():
                for batch in tqdm(images):
                    pil_batch = [F.to_pil_image(b.to("cpu")) for b in batch]
                    inputs = self._collate([text] * len(pil_batch), pil_batch)
                    image_outputs = self.encode_input(inputs)
                    all_image_embeddings.append(image_outputs.cpu().to(torch.float32))
        else:
            with torch.no_grad():
                for i in tqdm(range(0, len(images), batch_size)):
                    batch_images = list(images[i : i + batch_size])
                    inputs = self._collate([text] * len(batch_images), batch_images)
                    image_outputs = self.encode_input(inputs)
                    all_image_embeddings.append(image_outputs.cpu().to(torch.float32))

        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
        return all_image_embeddings

    def get_text_embeddings(
        self,
        texts: list[str],
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode texts in mini-batches; returns a float32 CPU tensor."""
        all_text_embeddings = []

        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size)):
                inputs = self._collate(list(texts[i : i + batch_size]), None)
                text_outputs = self.encode_input(inputs)
                all_text_embeddings.append(text_outputs.cpu().to(torch.float32))

        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
        return all_text_embeddings

    def calculate_probs(self, text_embeddings, image_embeddings):
        """Softmax over (scaled) cosine similarities, image rows vs text columns."""
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
        image_embeddings = image_embeddings / image_embeddings.norm(
            dim=-1, keepdim=True
        )
        logits = torch.matmul(image_embeddings, text_embeddings.T)
        probs = (logits * 100).softmax(dim=-1)
        return probs

    def get_fused_embeddings(
        self,
        texts: list[str] = None,
        images: list[Image.Image] | DataLoader = None,
        *,
        task_name: str | None = None,
        prompt_type: PromptType | None = None,
        batch_size: int = 32,
        fusion_mode="sum",
        **kwargs: Any,
    ):
        """Encode texts and/or images; pairs are fused inside one VLM prompt.

        NOTE(review): the DataLoader branch assumes the loader's batch size
        equals `batch_size` only implicitly (texts are consumed in order via
        an iterator), so mismatched sizes are tolerated here.
        """
        if texts is None and images is None:
            raise ValueError("Either texts or images must be provided")

        kwargs.update(
            task_name=task_name, prompt_type=prompt_type, batch_size=batch_size
        )

        if texts is not None and images is None:
            return self.get_text_embeddings(texts, **kwargs)
        if images is not None and texts is None:
            return self.get_image_embeddings(images, **kwargs)

        # Both modalities: feed each (text, image) pair through one prompt.
        text_iter = iter(texts)
        all_fused_embeddings = []
        if isinstance(images, DataLoader):
            import torchvision.transforms.functional as F

            with torch.no_grad():
                for batch in images:
                    pil_batch = [F.to_pil_image(b.to("cpu")) for b in batch]
                    prompts = [
                        f"<|image_1|> Represent the given image with the following question: {next(text_iter)}"
                        for _ in pil_batch
                    ]
                    inputs = self._collate(prompts, pil_batch)
                    outputs = self.encode_input(inputs)
                    all_fused_embeddings.append(outputs.cpu().to(torch.float32))
        else:
            with torch.no_grad():
                for i in tqdm(range(0, len(images), batch_size)):
                    batch_images = list(images[i : i + batch_size])
                    prompts = [
                        f"<|image_1|> Represent the given image with the following question: {next(text_iter)}"
                        for _ in batch_images
                    ]
                    inputs = self._collate(prompts, batch_images)
                    outputs = self.encode_input(inputs)
                    all_fused_embeddings.append(outputs.cpu().to(torch.float32))

        fused_embeddings = torch.cat(all_fused_embeddings, dim=0)
        return fused_embeddings
+
+
+vlm2vec_training_datasets = {
+ # MMEB-train
+}
+
+vlm2vec_lora = ModelMeta(
+ loader=partial(
+ VLM2VecWrapper,
+ model_name="TIGER-Lab/VLM2Vec-LoRA",
+ ),
+ name="TIGER-Lab/VLM2Vec-LoRA",
+ languages=["eng_Latn"],
+ revision="7403b6327958071c1e33c822c7453adadccc7298",
+ release_date="2024-10-08",
+ modalities=["image", "text"],
+ n_parameters=None,
+ max_tokens=131072,
+ embed_dim=3072,
+ license="apache-2.0",
+ open_weights=True,
+ public_training_code="https://github.com/TIGER-AI-Lab/VLM2Vec",
+ public_training_data="https://huggingface.co/datasets/TIGER-Lab/MMEB-train",
+ framework=["PyTorch"],
+ reference="https://huggingface.co/TIGER-Lab/VLM2Vec-LoRA",
+ similarity_fn_name=None,
+ use_instructions=True,
+ training_datasets=vlm2vec_training_datasets,
+)
+
vlm2vec_full = ModelMeta(
    loader=partial(
        VLM2VecWrapper,
        model_name="TIGER-Lab/VLM2Vec-Full",
    ),
    name="TIGER-Lab/VLM2Vec-Full",
    languages=["eng_Latn"],
    revision="e9afa98002097ac2471827ba23ea1f2ddd229480",
    release_date="2024-10-08",
    modalities=["image", "text"],
    n_parameters=4_150_000_000,
    max_tokens=131072,
    embed_dim=3072,
    license="apache-2.0",
    open_weights=True,
    public_training_code="https://github.com/TIGER-AI-Lab/VLM2Vec",
    # Fix: point at the MMEB-train dataset (consistent with vlm2vec_lora)
    # rather than at the model repository itself.
    public_training_data="https://huggingface.co/datasets/TIGER-Lab/MMEB-train",
    framework=["PyTorch"],
    reference="https://huggingface.co/TIGER-Lab/VLM2Vec-Full",
    similarity_fn_name=None,
    use_instructions=True,
    training_datasets=vlm2vec_training_datasets,
)
diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py
new file mode 100644
index 0000000000..6968fec03c
--- /dev/null
+++ b/mteb/models/voyage_v.py
@@ -0,0 +1,262 @@
+from __future__ import annotations
+
+import logging
+import os
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+from tqdm import tqdm
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+
+api_key = os.getenv("VOYAGE_API_KEY")
+tensor_to_image = transforms.Compose([transforms.ToPILImage()])
+
+
+def downsample_image(
+ image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
+) -> Image.Image:
+ """If image pixel > max_pixels, downsample it to target_longest_side while keeping the width height ratio."""
+ width, height = image.size
+ pixels = width * height
+
+ if pixels > max_pixels:
+ if width > height:
+ new_width = target_longest_side
+ new_height = int(height * (target_longest_side / width))
+ else:
+ new_height = target_longest_side
+ new_width = int(width * (target_longest_side / height))
+
+ new_size = (new_width, new_height)
+ logging.info(
+ f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
+ )
+ return image.resize(new_size, Image.LANCZOS)
+ if width > height:
+ if width > 10000:
+ logging.error("Processing extremely wide images.")
+ return image.resize((10000, height), Image.LANCZOS)
+ else:
+ if height > 10000:
+ logging.error("Processing extremely high images.")
+ return image.resize((width, 10000), Image.LANCZOS)
+ return image
+
+
+def voyage_v_loader(**kwargs):
+ try:
+ import voyageai
+ except ImportError:
+ raise ImportError("To use voyage models, please run `pip install -U voyageai`.")
+ try:
+ from tenacity import retry, stop_after_attempt, wait_exponential
+ except ImportError:
+ raise ImportError(
+ "please run `pip install tenacity` to use exponential backoff."
+ )
+
+ class VoyageMultiModalModelWrapper:
+ def __init__(
+ self,
+ model_name: str,
+ **kwargs: Any,
+ ):
+ self.model_name = model_name
+ self.vo = voyageai.Client()
+
+ @retry(
+ stop=stop_after_attempt(6), # Stop after 6 attempts
+ wait=wait_exponential(multiplier=1, max=60), # Exponential backoff
+ )
+ def _multimodal_embed(self, inputs, model, input_type):
+ return self.vo.multimodal_embed(inputs, model=model, input_type=input_type)
+
+ def get_text_embeddings(
+ self,
+ texts: list[str],
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ input_type=None,
+ **kwargs: Any,
+ ):
+ if input_type is None and prompt_type is not None:
+ if prompt_type == PromptType.passage:
+ input_type = "document"
+ elif prompt_type == PromptType.query:
+ input_type = "query"
+
+ all_text_embeddings = []
+
+ batch_size = 128 # for run tasks purpose
+
+ for i in tqdm(range(0, len(texts), batch_size)):
+ batch_texts = texts[i : i + batch_size]
+ batch_texts = [[text] for text in batch_texts]
+
+ # with retry mechanism
+ embeddings = self._multimodal_embed(
+ batch_texts, model=self.model_name, input_type=input_type
+ ).embeddings
+ all_text_embeddings.append(torch.tensor(embeddings))
+ all_text_embeddings = torch.vstack(all_text_embeddings)
+ return all_text_embeddings
+
+ def get_image_embeddings(
+ self,
+ images: list[Image.Image] | DataLoader,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ input_type=None,
+ **kwargs: Any,
+ ):
+ if input_type is None and prompt_type is not None:
+ if prompt_type == PromptType.passage:
+ input_type = "document"
+ elif prompt_type == PromptType.query:
+ input_type = "query"
+
+ all_image_embeddings = []
+
+ if isinstance(images, DataLoader):
+ for index, batch in enumerate(tqdm(images)):
+ if index == 0:
+ assert len(batch) == batch_size
+ batch_images = [
+ [downsample_image(tensor_to_image(image))] for image in batch
+ ]
+ embeddings = self._multimodal_embed(
+ batch_images, model=self.model_name, input_type=input_type
+ ).embeddings
+ all_image_embeddings.append(torch.tensor(embeddings))
+ else:
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ batch_images = [[downsample_image(image)] for image in batch_images]
+ embeddings = self._multimodal_embed(
+ batch_images, model=self.model_name, input_type=input_type
+ ).embeddings
+ all_image_embeddings.append(torch.tensor(embeddings))
+ all_image_embeddings = torch.vstack(all_image_embeddings)
+ return all_image_embeddings
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ text_embeddings = text_embeddings / text_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ image_embeddings = image_embeddings / image_embeddings.norm(
+ dim=-1, keepdim=True
+ )
+ logits = torch.matmul(image_embeddings, text_embeddings.T)
+ probs = (logits * 100).softmax(dim=-1)
+ return probs
+
+ def get_fused_embeddings(
+ self,
+ texts: list[str] = None,
+ images: list[Image.Image] | DataLoader = None,
+ *,
+ task_name: str | None = None,
+ prompt_type: PromptType | None = None,
+ batch_size: int = 32,
+ input_type=None,
+ **kwargs: Any,
+ ):
+ if texts is None and images is None:
+ raise ValueError("Either texts or images must be provided")
+
+ if input_type is None and prompt_type is not None:
+ if prompt_type == PromptType.passage:
+ input_type = "document"
+ elif prompt_type == PromptType.query:
+ input_type = "query"
+
+ text_embeddings = None
+ image_embeddings = None
+
+ interleaved_embeddings = []
+ if texts is not None and images is not None:
+ if isinstance(images, DataLoader):
+                for index, batch in enumerate(tqdm(images)):
+ if index == 0:
+ assert len(batch) == batch_size
+ batch_images = [
+ downsample_image(tensor_to_image(image)) for image in batch
+ ]
+ batch_texts = texts[
+ index * batch_size : (index + 1) * batch_size
+ ]
+ interleaved_inputs = [
+ [text, image]
+ for image, text in zip(batch_images, batch_texts)
+ ]
+ embeddings = self._multimodal_embed(
+ interleaved_inputs,
+ model=self.model_name,
+ input_type=input_type,
+ ).embeddings
+ interleaved_embeddings.append(torch.tensor(embeddings))
+ else:
+ for i in tqdm(range(0, len(images), batch_size)):
+ batch_images = images[i : i + batch_size]
+ batch_texts = texts[i : i + batch_size]
+ interleaved_inputs = [
+ [text, image]
+ for image, text in zip(batch_images, batch_texts)
+ ]
+ embeddings = self._multimodal_embed(
+ interleaved_inputs,
+ model=self.model_name,
+ input_type=input_type,
+ ).embeddings
+ interleaved_embeddings.append(torch.tensor(embeddings))
+ interleaved_embeddings = torch.vstack(interleaved_embeddings)
+ return interleaved_embeddings
+
+ elif texts is not None:
+ text_embeddings = self.get_text_embeddings(
+                    texts, batch_size=batch_size, input_type=input_type
+ )
+
+ elif images is not None:
+ image_embeddings = self.get_image_embeddings(
+                    images, batch_size=batch_size, input_type=input_type
+ )
+
+ if text_embeddings is not None:
+ return text_embeddings
+ elif image_embeddings is not None:
+ return image_embeddings
+
+ return VoyageMultiModalModelWrapper(**kwargs)
+
+
+voyage_v = ModelMeta(
+ loader=partial(voyage_v_loader, model_name="voyage-multimodal-3"),
+ name="voyage-multimodal-3",
+ languages=[], # Unknown
+ revision="1",
+ release_date="2024-11-10",
+ n_parameters=None,
+ max_tokens=None,
+ embed_dim=1024,
+ license=None,
+ similarity_fn_name="cosine",
+ framework=[],
+ modalities=["image", "text"],
+ open_weights=None,
+ public_training_code=None,
+ public_training_data=None,
+ reference=None,
+ use_instructions=None,
+ training_datasets=None,
+)
diff --git a/mteb/normalize_embeddings.py b/mteb/normalize_embeddings.py
new file mode 100644
index 0000000000..b9ee635806
--- /dev/null
+++ b/mteb/normalize_embeddings.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+
+def normalize_embeddings_to_numpy(
+ embeddings: torch.Tensor | np.ndarray | list[np.ndarray] | list[torch.Tensor],
+) -> np.ndarray:
+ """Normalize embeddings to be numpy arrays
+
+
+ Args:
+ embeddings: embeddings to normalize
+
+ Returns:
+ Normalized embeddings
+ """
+ if isinstance(embeddings, torch.Tensor):
+ embeddings = embeddings.cpu().detach().float().numpy()
+ elif isinstance(embeddings, list):
+ if isinstance(embeddings[0], torch.Tensor):
+ embeddings = [
+ embedding.cpu().detach().float().numpy() for embedding in embeddings
+ ]
+ elif isinstance(embeddings[0], np.ndarray):
+ embeddings = embeddings
+
+ numpy_embeddings = np.array(embeddings)
+
+ return numpy_embeddings
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py
new file mode 100644
index 0000000000..0e3b6d4505
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+from .eng.BLINKIT2IMultiChoice import *
+from .eng.BLINKIT2TMultiChoice import *
+from .eng.ImageCoDeT2IMultiChoice import *
+from .eng.ROxfordI2IMultiChoice import *
+from .eng.RParisI2IMultiChoice import *
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py
new file mode 100644
index 0000000000..58db0c8c92
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BLINKIT2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="BLINKIT2IMultiChoice",
+ description="Retrieve images based on images and specific retrieval instructions.",
+ reference="https://arxiv.org/abs/2404.12390",
+ dataset={
+ "path": "JamieSJS/blink-it2i-multi",
+ "revision": "a9f994925551c14503d00d86f1307bac6e2ead6a",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyMultiChoice",
+ category="it2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{fu2024blink,
+ title={Blink: Multimodal large language models can see but not perceive},
+ author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+ journal={arXiv preprint arXiv:2404.12390},
+ year={2024}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 534},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 1200,
+ "num_queries": 534,
+ "average_relevant_docs_per_query": 1,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py
new file mode 100644
index 0000000000..0a1dfcdc42
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BLINKIT2TMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="BLINKIT2TMultiChoice",
+ description="Retrieve the correct text answer based on images and specific retrieval instructions.",
+ reference="https://arxiv.org/abs/2404.12390",
+ dataset={
+ "path": "JamieSJS/blink-it2t-multi",
+ "revision": "bc8f4c7f62450a4ceb737c8339061cf87aea42d5",
+ },
+ type="Any2AnyMultiChoice",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{fu2024blink,
+ title={Blink: Multimodal large language models can see but not perceive},
+ author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+ journal={arXiv preprint arXiv:2404.12390},
+ year={2024}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 923},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 24,
+ "num_queries": 923,
+ "average_relevant_docs_per_query": 1,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py
new file mode 100644
index 0000000000..3cb875b845
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class ImageCoDeT2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="ImageCoDeT2IMultiChoice",
+ description="Identify the correct image from a set of similar images based on a precise caption.",
+ reference="https://aclanthology.org/2022.acl-long.241.pdf",
+ dataset={
+ "path": "JamieSJS/imagecode-multi",
+ "revision": "d28adfd8b34fefa546fdf94bdc352622b2575f6c",
+ },
+ type="Any2AnyMultiChoice",
+ category="it2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2022-05-22", "2022-05-27"), # conference dates
+ domains=["Web", "Written"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{krojer2022image,
+ title={Image retrieval from contextual descriptions},
+ author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva},
+ journal={arXiv preprint arXiv:2203.15867},
+ year={2022}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 2302},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 23020,
+ "num_queries": 2302,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py
new file mode 100644
index 0000000000..136848c128
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class ROxfordEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="ROxfordEasyI2IMultiChoice",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-easy-multi",
+ "revision": "4c167c3ce529f19457c9b8e694258cc6cf8e7cc7",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 516,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 43.3,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class ROxfordMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="ROxfordMediumI2IMultiChoice",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-medium-multi",
+ "revision": "83bd440268e200a4f60313070618e3f45000fa94",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 788,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 78.9,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class ROxfordHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="ROxfordHardI2IMultiChoice",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-hard-multi",
+ "revision": "fc7c4ae6655b1e6b132f3b262a359acef42dfce8",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 685,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 35.7,
+ }
+ },
+ },
+ )
+ skip_first_result = False
diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py
new file mode 100644
index 0000000000..69da75118f
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class RParisEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="RParisEasyI2IMultiChoice",
+        description="Retrieve photos of landmarks in Paris, France.",
+        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-paris-easy-multi",
+ "revision": "db94b5afd0014ab8c978f20a0fbcc52da1612a08",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 516,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 43.3,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class RParisMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="RParisMediumI2IMultiChoice",
+        description="Retrieve photos of landmarks in Paris, France.",
+        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-paris-medium-multi",
+ "revision": "372c79fc823e1cebc1d55f8e0039aa239285e177",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 788,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 78.9,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class RParisHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ name="RParisHardI2IMultiChoice",
+        description="Retrieve photos of landmarks in Paris, France.",
+        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-paris-hard-multi",
+ "revision": "4e5997e48fb2f2f8bf1c8973851dedeb17e09a83",
+ },
+ type="Any2AnyMultiChoice",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+    title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 685,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 35.7,
+ }
+ },
+ },
+ )
+ skip_first_result = False
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/__init__.py b/mteb/tasks/Image/Any2AnyRetrieval/__init__.py
new file mode 100644
index 0000000000..9628a41d84
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/__init__.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from .eng.BLINKIT2IRetrieval import *
+from .eng.BLINKIT2TRetrieval import *
+from .eng.CIRRIT2IRetrieval import *
+from .eng.CUB200I2IRetrieval import *
+from .eng.EDIST2ITRetrieval import *
+from .eng.EncyclopediaVQAIT2ITRetrieval import *
+from .eng.Fashion200kI2TRetrieval import *
+from .eng.Fashion200kT2IRetrieval import *
+from .eng.FashionIQIT2IRetrieval import *
+from .eng.Flickr30kI2TRetrieval import *
+from .eng.Flickr30kT2IRetrieval import *
+from .eng.FORBI2IRetrieval import *
+from .eng.GLDv2I2IRetrieval import *
+from .eng.GLDv2I2TRetrieval import *
+from .eng.HatefulMemesI2TRetrieval import *
+from .eng.HatefulMemesT2IRetrieval import *
+from .eng.ImageCoDeT2IRetrieval import *
+from .eng.InfoSeekIT2ITRetrieval import *
+from .eng.InfoSeekIT2TRetrieval import *
+from .eng.LLaVAIT2TRetrieval import *
+from .eng.MemotionI2TRetrieval import *
+from .eng.MemotionT2IRetrieval import *
+from .eng.METI2IRetrieval import *
+from .eng.MSCOCOI2TRetrieval import *
+from .eng.MSCOCOT2IRetrieval import *
+from .eng.NIGHTSI2IRetrieval import *
+from .eng.OKVQAIT2TRetrieval import *
+from .eng.OVENIT2ITRetrieval import *
+from .eng.OVENIT2TRetrieval import *
+from .eng.ReMuQIT2TRetrieval import *
+from .eng.ROxfordI2IRetrieval import *
+from .eng.RP2kI2IRetrieval import *
+from .eng.RParisI2IRetrieval import *
+from .eng.SciMMIRI2TRetrieval import *
+from .eng.SciMMIRT2IRetrieval import *
+from .eng.SketchyI2IRetrieval import *
+from .eng.SOPI2IRetrieval import *
+from .eng.StanfordCarsI2IRetrieval import *
+from .eng.TUBerlinT2IRetrieval import *
+from .eng.VidoreBenchRetrieval import *
+from .eng.VisualNewsI2TRetrieval import *
+from .eng.VisualNewsT2IRetrieval import *
+from .eng.VizWizIT2TRetrieval import *
+from .eng.VQA2IT2TRetrieval import *
+from .eng.WebQAT2ITRetrieval import *
+from .eng.WebQAT2TRetrieval import *
+from .multilingual.WITT2IRetrieval import *
+from .multilingual.XFlickr30kCoT2IRetrieval import *
+from .multilingual.XM3600T2IRetrieval import *
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py
new file mode 100644
index 0000000000..8202bb133c
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BLINKIT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="BLINKIT2IRetrieval",
+ description="Retrieve images based on images and specific retrieval instructions.",
+ reference="https://arxiv.org/abs/2404.12390",
+ dataset={
+ "path": "JamieSJS/blink-it2i",
+ "revision": "7a1a1330565faca9c1aeec6f5acfc64f21296753",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{fu2024blink,
+ title={Blink: Multimodal large language models can see but not perceive},
+ author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+ journal={arXiv preprint arXiv:2404.12390},
+ year={2024}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 402},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 804,
+ "num_queries": 402,
+ "average_relevant_docs_per_query": 1,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py
new file mode 100644
index 0000000000..ff6ec42427
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BLINKIT2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="BLINKIT2TRetrieval",
+        description="Retrieve the correct text answer based on images and specific retrieval instructions.",
+ reference="https://arxiv.org/abs/2404.12390",
+ dataset={
+ "path": "JamieSJS/blink-it2t",
+ "revision": "c6470936de49d6d2ae5fc09612752c75175ce5b6",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{fu2024blink,
+ title={Blink: Multimodal large language models can see but not perceive},
+ author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+ journal={arXiv preprint arXiv:2404.12390},
+ year={2024}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 1073},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 26,
+ "num_queries": 1073,
+ "average_relevant_docs_per_query": 1,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py
new file mode 100644
index 0000000000..ed0172ae79
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CIRRIT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="CIRRIT2IRetrieval",
+ description="Retrieve images based on texts and images.",
+ reference="https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Image_Retrieval_on_Real-Life_Images_With_Pre-Trained_Vision-and-Language_Models_ICCV_2021_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_cirr_task7",
+ "revision": "503301cd99348035b9675883a543aa1ded0cf07c",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{liu2021image,
+ title={Image retrieval on real-life images with pre-trained vision-and-language models},
+ author={Liu, Zheyuan and Rodriguez-Opazo, Cristian and Teney, Damien and Gould, Stephen},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={2125--2134},
+ year={2021}
+ }""",
+ prompt={
+ "query": "Retrieve a day-to-day image that aligns with the modification instructions of the provided image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 4170},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 21551,
+ "num_queries": 4170,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py
new file mode 100644
index 0000000000..95a3c9a77c
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CUB200I2I(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="CUB200I2IRetrieval",
+ description="Retrieve bird images from 200 classes.",
+ reference="https://www.florian-schroff.de/publications/CUB-200.pdf",
+ dataset={
+ "path": "isaacchung/cub200_retrieval",
+ "revision": "ad08c1307b15a226bf1b64e62656a17f1f85f7ec",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@article{article,
+ author = {Welinder, Peter and Branson, Steve and Mita, Takeshi and Wah, Catherine and Schroff, Florian and Belongie, Serge and Perona, Pietro},
+ year = {2010},
+ month = {09},
+ pages = {},
+ title = {Caltech-UCSD Birds 200}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"default": 5794},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 1074.894348894349,
+ "average_query_length": 77.06142506142506,
+ "num_documents": 5794,
+ "num_queries": 5794,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
+ skip_first_result = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py
new file mode 100644
index 0000000000..ac7b310998
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class EDIST2ITRetrieval(AbsTaskAny2AnyRetrieval):
+ """EDIS (M-BEIR task 2): text-to-(image+text) retrieval of news photos with
+ their headlines given a news caption."""
+
+ metadata = TaskMetadata(
+ name="EDIST2ITRetrieval",
+ description="Retrieve news images and titles based on news content.",
+ reference="https://aclanthology.org/2023.emnlp-main.297/",
+ dataset={
+ "path": "MRBench/mbeir_edis_task2",
+ "revision": "68c47ef3e49ef883073b3358bd4243eeca0aee9a",
+ },
+ type="Any2AnyRetrieval",
+ category="t2it",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["News"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{liu2023edis,
+ title={EDIS: Entity-Driven Image Search over Multimodal Web Content},
+ author={Liu, Siqi and Feng, Weixi and Fu, Tsu-Jui and Chen, Wenhu and Wang, William},
+ booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
+ pages={4877--4894},
+ year={2023}
+}""",
+ prompt={"query": "Identify the news photo for the given caption."},
+ descriptive_stats={
+ "n_samples": {"test": 3241},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 1047067,
+ "num_queries": 3241,
+ "average_relevant_docs_per_query": 2.57,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py
new file mode 100644
index 0000000000..01f2e6a980
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class EncyclopediaVQAIT2ITRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="EncyclopediaVQAIT2ITRetrieval",
+ description="Retrieval Wiki passage and image and passage to answer query about an image.",
+ reference="https://github.com/google-research/google-research/tree/master/encyclopedic_vqa",
+ dataset={
+ "path": "izhx/UMRB-EncyclopediaVQA",
+ "revision": "d6eae4f06e260664eb3f276fd1bdb5d4d4c9f32b",
+ },
+ type="Any2AnyRetrieval",
+ category="it2it",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_5",
+ date=("2023-01-01", "2023-07-20"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{mensink2023encyclopedic,
+ title={Encyclopedic VQA: Visual questions about detailed properties of fine-grained categories},
+ author={Mensink, Thomas and Uijlings, Jasper and Castrejon, Lluis and Goel, Arushi and Cadar, Felipe and Zhou, Howard and Sha, Fei and Araujo, Andr{\'e} and Ferrari, Vittorio},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={3113--3124},
+ year={2023}
+}""",
+ prompt={
+ "query": "Obtain illustrated documents that correspond to the inquiry alongside the provided image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 3743},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 1294.368802424136,
+ "average_query_length": 51.703713598717606,
+ "num_documents": 68313,
+ "num_queries": 3743,
+ "average_relevant_docs_per_query": 1.3056371894202512,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py
new file mode 100644
index 0000000000..a07a88b7b8
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class FORBI2I(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="FORBI2IRetrieval",
+ description="Retrieve flat object images from 8 classes.",
+ reference="https://github.com/pxiangwu/FORB",
+ dataset={
+ "path": "isaacchung/forb_retrieval",
+ "revision": "26ab4bd972854becada339afc80f5f3ffc047e2b",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2022-01-01", "2023-01-01"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@misc{wu2023forbflatobjectretrieval,
+ title={FORB: A Flat Object Retrieval Benchmark for Universal Image Embedding},
+ author={Pengxiang Wu and Siman Wang and Kevin Dela Rosa and Derek Hao Hu},
+ year={2023},
+ eprint={2309.16249},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/2309.16249},
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"default": 13250},
+ "avg_character_length": {
+ "test": {
+ "num_documents": 53984,
+ "num_queries": 13250,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py
new file mode 100644
index 0000000000..5ba43daf1d
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Fashion200kI2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="Fashion200kI2TRetrieval",
+ description="Retrieve clothes based on descriptions.",
+ reference="https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_fashion200k_task3",
+ "revision": "96a313715ecf67f5dfe70c4fa52406bc7bdfbeee",
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2017-01-01", "2017-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{han2017automatic,
+ title={Automatic spatially-aware fashion concept discovery},
+ author={Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S},
+ booktitle={Proceedings of the IEEE international conference on computer vision},
+ pages={1463--1471},
+ year={2017}
+}""",
+ prompt={
+ "query": "Based on the following fashion description, retrieve the best matching image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 4889},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 61707,
+ "num_queries": 4889,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py
new file mode 100644
index 0000000000..1511de7aa4
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Fashion200kT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ """Fashion200k (M-BEIR task 0): text-to-image retrieval — given a product
+ description, retrieve the matching clothing image."""
+
+ metadata = TaskMetadata(
+ name="Fashion200kT2IRetrieval",
+ description="Retrieve clothes based on descriptions.",
+ reference="https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_fashion200k_task0",
+ "revision": "1b86e2dde50e671d5c83d07a79e8b1d8c696964b",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2017-01-01", "2017-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{han2017automatic,
+ title={Automatic spatially-aware fashion concept discovery},
+ author={Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S},
+ booktitle={Proceedings of the IEEE international conference on computer vision},
+ pages={1463--1471},
+ year={2017}
+}""",
+ prompt={
+ "query": "Based on the following fashion description, retrieve the best matching image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 1719},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 30.94235294117647,
+ "average_query_length": 131.56569965870307,
+ "num_documents": 201824,
+ "num_queries": 1719,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py
new file mode 100644
index 0000000000..4e1209c23c
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class FashionIQIT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ """FashionIQ (M-BEIR task 7): composed retrieval — query is a reference
+ garment image plus natural-language feedback; target is the modified image."""
+
+ metadata = TaskMetadata(
+ name="FashionIQIT2IRetrieval",
+ description="Retrieve clothes based on descriptions.",
+ reference="https://openaccess.thecvf.com/content/CVPR2021/html/Wu_Fashion_IQ_A_New_Dataset_Towards_Retrieving_Images_by_Natural_CVPR_2021_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_fashioniq_task7",
+ "revision": "e6f0ec70becc413d940cd62b2cfa3b1d3a08c31a",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2021-01-01", "2021-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{wu2021fashion,
+ title={Fashion iq: A new dataset towards retrieving images by natural language feedback},
+ author={Wu, Hui and Gao, Yupeng and Guo, Xiaoxiao and Al-Halah, Ziad and Rennie, Steven and Grauman, Kristen and Feris, Rogerio},
+ booktitle={Proceedings of the IEEE/CVF Conference on computer vision and pattern recognition},
+ pages={11307--11317},
+ year={2021}
+}""",
+ prompt={
+ "query": "Find a fashion image that aligns with the reference image and style note."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 6003},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 74381,
+ "num_queries": 6003,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py
new file mode 100644
index 0000000000..43aeea20d4
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Flickr30kI2TRetrieval(AbsTaskAny2AnyRetrieval):
+ """Flickr30k image-to-text retrieval: given an image, retrieve its captions
+ (5 captions per image: 5000 docs for 1000 query images)."""
+
+ metadata = TaskMetadata(
+ name="Flickr30kI2TRetrieval",
+ description="Retrieve captions based on images.",
+ reference="https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31",
+ dataset={
+ "path": "isaacchung/flickr30ki2t",
+ "revision": "6984df6bd4380034e7766d9a992d8907df363efb",
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ # NOTE(review): the cited paper is from 2014; confirm the 2018 range.
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Web", "Written"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{Young2014FromID,
+ title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions},
+ author={Peter Young and Alice Lai and Micah Hodosh and J. Hockenmaier},
+ journal={Transactions of the Association for Computational Linguistics},
+ year={2014},
+ volume={2},
+ pages={67-78},
+ url={https://api.semanticscholar.org/CorpusID:3104920}
+}""",
+ prompt={"query": "Find an image caption describing the following image."},
+ descriptive_stats={
+ "n_samples": {"test": 1000},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 5000,
+ "num_queries": 1000,
+ # NOTE(review): average_relevant_docs_per_query is missing here,
+ # unlike sibling tasks — confirm whether it should be added.
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py
new file mode 100644
index 0000000000..cb87cfcf86
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Flickr30kT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ """Flickr30k text-to-image retrieval: given a caption, retrieve the image it
+ describes (5000 caption queries over 1000 image documents)."""
+
+ metadata = TaskMetadata(
+ name="Flickr30kT2IRetrieval",
+ description="Retrieve images based on captions.",
+ reference="https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31",
+ dataset={
+ "path": "isaacchung/flickr30kt2i",
+ "revision": "e819702b287bfbe084e129a61f308a802b7c108e",
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ # NOTE(review): the cited paper is from 2014; confirm the 2018 range.
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Web", "Written"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{Young2014FromID,
+ title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions},
+ author={Peter Young and Alice Lai and Micah Hodosh and J. Hockenmaier},
+ journal={Transactions of the Association for Computational Linguistics},
+ year={2014},
+ volume={2},
+ pages={67-78},
+ url={https://api.semanticscholar.org/CorpusID:3104920}
+}""",
+ prompt={"query": "Find an image that matches the given caption."},
+ descriptive_stats={
+ "n_samples": {"test": 5000},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 1000,
+ "num_queries": 5000,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py
new file mode 100644
index 0000000000..1d0c2c3bcf
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class GLDv2I2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="GLDv2I2IRetrieval",
+ description="Retrieve names of landmarks based on their image.",
+ reference="https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html",
+ dataset={
+ "path": "gowitheflow/gld-v2",
+ "revision": "c6b162ee349adb293901128a18c0b446f7b43457",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2017-01-01", "2017-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{Weyand_2020_CVPR,
+author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack},
+title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval},
+booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {June},
+year = {2020}
+}
+
+""",
+ descriptive_stats={
+ "n_samples": {"test": 1129},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 847728,
+ "num_queries": 1129,
+ "average_relevant_docs_per_query": 13.49,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py
new file mode 100644
index 0000000000..db61790fb8
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class GLDv2I2TRetrieval(AbsTaskAny2AnyRetrieval):
+ """Google Landmarks Dataset v2 image-to-text retrieval: given a landmark
+ photo, retrieve the landmark's name (674 name documents)."""
+
+ metadata = TaskMetadata(
+ name="GLDv2I2TRetrieval",
+ description="Retrieve names of landmarks based on their image.",
+ reference="https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html",
+ dataset={
+ "path": "JamieSJS/gld-v2-i2t",
+ "revision": "d8c3e53160860f76de73ed3041a8593672fe5928",
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2017-01-01", "2017-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{Weyand_2020_CVPR,
+author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack},
+title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval},
+booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {June},
+year = {2020}
+}
+
+""",
+ descriptive_stats={
+ "n_samples": {"test": 1972},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 674,
+ "num_queries": 1972,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py
new file mode 100644
index 0000000000..bf7e273d73
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+from datasets import concatenate_datasets, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+def _load_data(path: str, splits: list[str], cache_dir: str | None = None, revision: str | None = None):
+ """Convert the HatefulMemes dataset into (corpus, queries, relevant_docs).
+
+ The corpus holds meme captions (modality "text") and is shared across splits:
+ it concatenates every split of the source dataset, so each split's corpus
+ also contains distractor documents from the other splits. Queries are the
+ meme images (modality "image") of the given split. Each "query-<id>" is
+ relevant only to "corpus-<id>", i.e. the caption of the same example.
+ """
+ corpus = {}
+ queries = {}
+ relevant_docs = {}
+
+ dataset = load_dataset(
+ path,
+ cache_dir=cache_dir,
+ revision=revision,
+ )
+ dataset_splits = list(dataset)
+ # Build one shared corpus from all splits (adds cross-split distractors).
+ shared_corpus = concatenate_datasets([dataset[split] for split in dataset_splits])
+
+ shared_corpus = shared_corpus.map(
+ lambda x: {
+ "id": "corpus-" + str(x["id"]),
+ # "text": x["text"],
+ "modality": "text",
+ "image": None,
+ },
+ remove_columns=[
+ "split",
+ "label",
+ ],
+ )
+
+ for split in splits:
+ corpus[split] = shared_corpus
+ split_dataset = dataset[split]
+ queries[split] = split_dataset.map(
+ lambda x: {
+ "id": "query-" + str(x["id"]),
+ "text": None,
+ "modality": "image",
+ # "image": x["image"],
+ },
+ remove_columns=[
+ "split",
+ "label",
+ ],
+ )
+ relevant_docs[split] = {}
+ for example in split_dataset:
+ query_id = "query-" + str(example["id"])
+ doc_id = "corpus-" + str(example["id"])
+ if query_id not in relevant_docs[split]:
+ relevant_docs[split][query_id] = {}
+ relevant_docs[split][query_id][doc_id] = 1
+
+ return corpus, queries, relevant_docs
+
+
+class HatefulMemesI2TRetrieval(AbsTaskAny2AnyRetrieval):
+ """HatefulMemes image-to-text retrieval: given a meme image, retrieve its
+ caption (data built by the module-level ``_load_data`` helper)."""
+
+ metadata = TaskMetadata(
+ name="HatefulMemesI2TRetrieval",
+ description="Retrieve captions based on memes.",
+ reference="https://arxiv.org/pdf/2005.04790",
+ dataset={
+ "path": "Ahren09/MMSoc_HatefulMemes",
+ "revision": "c9a9a6c3ef0765622a6de0af6ebb68f323ad73ba",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2020-01-01", "2020-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{kiela2020hateful,
+ title={The hateful memes challenge: Detecting hate speech in multimodal memes},
+ author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide},
+ journal={Advances in neural information processing systems},
+ volume={33},
+ pages={2611--2624},
+ year={2020}
+}""",
+ descriptive_stats={
+ "n_samples": None,
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 61.0257,
+ "average_query_length": 0,
+ "num_documents": 10000,
+ "num_queries": 1000,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
+
+ def load_data(self, **kwargs):
+ """Populate corpus/queries/qrels via ``_load_data``; idempotent."""
+ if self.data_loaded:
+ return
+ self.corpus, self.queries, self.relevant_docs = _load_data(
+ path=self.metadata_dict["dataset"]["path"],
+ splits=self.metadata_dict["eval_splits"],
+ cache_dir=kwargs.get("cache_dir", None),
+ revision=self.metadata_dict["dataset"]["revision"],
+ )
+
+ self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py
new file mode 100644
index 0000000000..89912a1213
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+from datasets import concatenate_datasets, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+def _load_data(path: str, splits: list[str], cache_dir: str | None = None, revision: str | None = None):
+ """Convert the HatefulMemes dataset into (corpus, queries, relevant_docs).
+
+ Mirror image of the I2T loader: here the corpus holds the meme images
+ (modality "image") and the queries are the captions (modality "text"). The
+ corpus concatenates every split of the source dataset, so each split's
+ corpus also contains distractor documents from the other splits. Each
+ "query-<id>" is relevant only to "corpus-<id>" (same example).
+ """
+ corpus = {}
+ queries = {}
+ relevant_docs = {}
+
+ dataset = load_dataset(
+ path,
+ cache_dir=cache_dir,
+ revision=revision,
+ )
+ dataset_splits = list(dataset)
+ # Build one shared corpus from all splits (adds cross-split distractors).
+ shared_corpus = concatenate_datasets([dataset[split] for split in dataset_splits])
+
+ shared_corpus = shared_corpus.map(
+ lambda x: {
+ "id": "corpus-" + str(x["id"]),
+ "text": None,
+ "modality": "image",
+ # "image": None,
+ },
+ remove_columns=[
+ "split",
+ "label",
+ ],
+ )
+
+ for split in splits:
+ corpus[split] = shared_corpus
+ split_dataset = dataset[split]
+ queries[split] = split_dataset.map(
+ lambda x: {
+ "id": "query-" + str(x["id"]),
+ # "text": None,
+ "modality": "text",
+ "image": None,
+ },
+ remove_columns=[
+ "split",
+ "label",
+ ],
+ )
+ relevant_docs[split] = {}
+ for example in split_dataset:
+ query_id = "query-" + str(example["id"])
+ doc_id = "corpus-" + str(example["id"])
+ if query_id not in relevant_docs[split]:
+ relevant_docs[split][query_id] = {}
+ relevant_docs[split][query_id][doc_id] = 1
+
+ return corpus, queries, relevant_docs
+
+
+class HatefulMemesT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="HatefulMemesT2IRetrieval",
+ description="Retrieve captions based on memes.",
+ reference="https://arxiv.org/pdf/2005.04790",
+ dataset={
+ "path": "Ahren09/MMSoc_HatefulMemes",
+ "revision": "c9a9a6c3ef0765622a6de0af6ebb68f323ad73ba",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2020-01-01", "2020-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{kiela2020hateful,
+ title={The hateful memes challenge: Detecting hate speech in multimodal memes},
+ author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide},
+ journal={Advances in neural information processing systems},
+ volume={33},
+ pages={2611--2624},
+ year={2020}
+}""",
+ descriptive_stats={
+ "n_samples": None,
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0,
+ "average_query_length": 61.0257,
+ "num_documents": 10000,
+ "num_queries": 1000,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
+
+ def load_data(self, **kwargs):
+ if self.data_loaded:
+ return
+ self.corpus, self.queries, self.relevant_docs = _load_data(
+ path=self.metadata_dict["dataset"]["path"],
+ splits=self.metadata_dict["eval_splits"],
+ cache_dir=kwargs.get("cache_dir", None),
+ revision=self.metadata_dict["dataset"]["revision"],
+ )
+
+ self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py
new file mode 100644
index 0000000000..1b8472294c
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class ImageCoDeT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ """ImageCoDe text-to-image retrieval: pick the exact video frame described by
+ a contextual caption (10 candidate frames per query)."""
+
+ metadata = TaskMetadata(
+ name="ImageCoDeT2IRetrieval",
+ description="Retrieve a specific video frame based on a precise caption.",
+ reference="https://aclanthology.org/2022.acl-long.241.pdf",
+ dataset={
+ "path": "JamieSJS/imagecode",
+ "revision": "a424cd523ffb157b69a875fb5e71c1d51be54089",
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2022-05-22", "2022-05-27"), # conference dates
+ domains=["Web", "Written"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@article{krojer2022image,
+ title={Image retrieval from contextual descriptions},
+ author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva},
+ journal={arXiv preprint arXiv:2203.15867},
+ year={2022}
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 2302},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 23020,
+ "num_queries": 2302,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py
new file mode 100644
index 0000000000..f695de1d19
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class InfoSeekIT2ITRetrieval(AbsTaskAny2AnyRetrieval):
+ """InfoSeek (M-BEIR task 8): (image+question) -> (image+text) retrieval of
+ Wikipedia evidence that answers a visual information-seeking question."""
+
+ metadata = TaskMetadata(
+ name="InfoSeekIT2ITRetrieval",
+ description="Retrieve source text and image information to answer questions about images.",
+ reference="https://aclanthology.org/2023.emnlp-main.925",
+ dataset={
+ "path": "MRBench/mbeir_infoseek_task8",
+ "revision": "78ee7f7708aac75d3afac5dcab1c9e03cb62664c",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2it",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{chen2023can,
+ title={Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?},
+ author={Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei},
+ booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
+ pages={14948--14968},
+ year={2023}
+}""",
+ prompt={
+ "query": "Find an image and subject description from Wikipedia that answers my question about this image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 17593},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 481782,
+ "num_queries": 17593,
+ "average_relevant_docs_per_query": 7.5,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py
new file mode 100644
index 0000000000..e5cecd8591
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class InfoSeekIT2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="InfoSeekIT2TRetrieval",
+ description="Retrieve source information to answer questions about images.",
+ reference="https://aclanthology.org/2023.emnlp-main.925",
+ dataset={
+ "path": "MRBench/mbeir_infoseek_task6",
+ "revision": "d4f4606f7a42bbf311c2957419ef3734fe81c47f",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{chen2023can,
+ title={Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?},
+ author={Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei},
+ booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
+ pages={14948--14968},
+ year={2023}
+}""",
+ prompt={
+ "query": "Find a paragraph from Wikipedia that answers my question about this image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 11323},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 611651,
+ "num_queries": 11323,
+ "average_relevant_docs_per_query": 6.5,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py
new file mode 100644
index 0000000000..9a0ded2203
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class LLaVAIT2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="LLaVAIT2TRetrieval",
+ description="Retrieve responses to answer questions about images.",
+ reference="https://github.com/LinWeizheDragon/FLMR/blob/main/docs/Datasets.md",
+ dataset={
+ "path": "izhx/UMRB-LLaVA",
+ "revision": "2a5ed414aab388d8cdd244ba2cf8c8960df4d44d",
+ },
+ type="Any2AnyRetrieval",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_5",
+ date=("2024-02-26", "2024-07-06"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{lin-etal-2024-preflmr,
+ title = "{P}re{FLMR}: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers",
+ author = "Lin, Weizhe and
+ Mei, Jingbiao and
+ Chen, Jinghong and
+ Byrne, Bill",
+ editor = "Ku, Lun-Wei and
+ Martins, Andre and
+ Srikumar, Vivek",
+ booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ month = aug,
+ year = "2024",
+ address = "Bangkok, Thailand",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2024.acl-long.289",
+ doi = "10.18653/v1/2024.acl-long.289",
+ pages = "5294--5316",
+}""",
+ prompt={
+ "query": "Provide a specific description of the image along with the following question."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 5120},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 546.1925258591925,
+ "average_query_length": 59.580859375,
+ "num_documents": 5994,
+ "num_queries": 5120,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py
new file mode 100644
index 0000000000..399c1fb792
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class METI2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="METI2IRetrieval",
+ description="Retrieve photos of more than 224k artworks.",
+ reference="https://arxiv.org/abs/2202.01747",
+ dataset={
+ "path": "JamieSJS/met",
+ "revision": "08ceaa61c0d172214abb3b8e82971d8f69d2aec0",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2021-12-06", "2021-12-14"), # conference dates
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{ypsilantis2021met,
+ title={The met dataset: Instance-level recognition for artworks},
+ author={Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos},
+ booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
+ year={2021}
+}
+ """,
+ descriptive_stats={
+ # "n_samples": {"default": 397121},
+ },
+ )
+ skip_first_result = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py
new file mode 100644
index 0000000000..bc4ce63c72
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MSCOCOI2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="MSCOCOI2TRetrieval",
+ description="Retrieve captions based on images.",
+ reference="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48",
+ dataset={
+ "path": "MRBench/mbeir_mscoco_task3",
+ "revision": "cca3a3e223763e6519a4d68936bc9279034d75d2",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+ }""",
+ prompt={
+ "query": "Find an image caption describing the following everyday image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 5000},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 30.94235294117647,
+ "average_query_length": 131.56569965870307,
+ "num_documents": 24809,
+ "num_queries": 5000,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py
new file mode 100644
index 0000000000..4885e236c2
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MSCOCOT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="MSCOCOT2IRetrieval",
+ description="Retrieve images based on captions.",
+ reference="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48",
+ dataset={
+ "path": "MRBench/mbeir_mscoco_task0",
+ "revision": "cfe15bd2791dde5f8f20aebecf0b4eb3812972d6",
+ "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+ }""",
+ prompt={"query": "Identify the image showcasing the described everyday scene."},
+ descriptive_stats={
+ "n_samples": {"test": 24809},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 5000,
+ "num_queries": 24809,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
new file mode 100644
index 0000000000..dfc42881df
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from datasets import concatenate_datasets, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
+ corpus = {}
+ queries = {}
+ relevant_docs = {}
+
+ dataset = load_dataset(
+ path,
+ cache_dir=cache_dir,
+ revision=revision,
+ )
+ dataset_splits = list(dataset)
+
+ def map_function(split_name):
+ return lambda x, idx: {
+ "id": f"corpus-{split_name}-{idx}",
+ "text": x["text_corrected"] if x["text_corrected"] else "",
+ "modality": "text",
+ "image": None,
+ }
+
+ # Apply the map function to each split and concatenate
+ shared_corpus = concatenate_datasets(
+ [
+ dataset[split].map(
+ map_function(split),
+ with_indices=True,
+ remove_columns=[
+ "split",
+ "text_ocr",
+ "text_corrected",
+ "humor",
+ "sarcasm",
+ "offensive",
+ "motivational",
+ "sentiment",
+ ],
+ )
+ for split in dataset_splits
+ ]
+ )
+ # image corrupted & caption empty
+ shared_corpus = shared_corpus.select(
+ [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]]
+ )
+ for split in splits:
+ corpus[split] = shared_corpus
+ split_dataset = dataset[split]
+ queries[split] = split_dataset.map(
+ lambda x, idx: {
+ "id": f"query-{split}-{idx}",
+ "text": None,
+ "modality": "image",
+ # "image": None,
+ },
+ with_indices=True,
+ remove_columns=[
+ "split",
+ "text_ocr",
+ "humor",
+ "sarcasm",
+ "offensive",
+ "motivational",
+ "sentiment",
+ "text_corrected",
+ ],
+ )
+ if split == "test":
+ queries[split] = queries[split].select(
+ [i for i in range(len(queries[split])) if i not in [489, 492, 494]]
+ )
+ relevant_docs[split] = {}
+ for index in range(len(split_dataset)):
+ if index not in [489, 492, 494]:
+ query_id = f"query-{split}-{index}"
+ doc_id = f"corpus-{split}-{index}"
+ if query_id not in relevant_docs[split]:
+ relevant_docs[split][query_id] = {}
+ relevant_docs[split][query_id][doc_id] = 1
+ return corpus, queries, relevant_docs
+
+
+class MemotionI2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="MemotionI2TRetrieval",
+ description="Retrieve captions based on memes.",
+ reference="https://aclanthology.org/2020.semeval-1.99/",
+ dataset={
+ "path": "Ahren09/MMSoc_Memotion",
+ "revision": "cdb15b61d84d56db73e0e59535dfea81ea3c22f4",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2020-01-01", "2020-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{sharma2020semeval,
+ title={SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!},
+ author={Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn},
+ booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation},
+ pages={759--773},
+ year={2020}
+}""",
+ descriptive_stats={
+ "n_samples": None,
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 83.80057388809182,
+ "average_query_length": 1.0,
+ "num_documents": 6988,
+ "num_queries": 697,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
+
+ def load_data(self, **kwargs):
+ if self.data_loaded:
+ return
+ self.corpus, self.queries, self.relevant_docs = _load_data(
+ path=self.metadata_dict["dataset"]["path"],
+ splits=self.metadata_dict["eval_splits"],
+ cache_dir=kwargs.get("cache_dir", None),
+ revision=self.metadata_dict["dataset"]["revision"],
+ )
+
+ self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
new file mode 100644
index 0000000000..dff7746b5a
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
@@ -0,0 +1,143 @@
+from __future__ import annotations
+
+from datasets import concatenate_datasets, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
+ corpus = {}
+ queries = {}
+ relevant_docs = {}
+
+ dataset = load_dataset(
+ path,
+ cache_dir=cache_dir,
+ revision=revision,
+ )
+ dataset_splits = list(dataset)
+
+ def map_function(split_name):
+ return lambda x, idx: {
+ "id": f"corpus-{split_name}-{idx}",
+ "text": None,
+ "modality": "image",
+ }
+
+ # Apply the map function to each split and concatenate
+ shared_corpus = concatenate_datasets(
+ [
+ dataset[split].map(
+ map_function(split),
+ with_indices=True,
+ remove_columns=[
+ "split",
+ "text_ocr",
+ "text_corrected",
+ "humor",
+ "sarcasm",
+ "offensive",
+ "motivational",
+ "sentiment",
+ ],
+ )
+ for split in dataset_splits
+ ]
+ )
+ # image corrupted
+ shared_corpus = shared_corpus.select(
+ [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]]
+ )
+ for split in splits:
+ corpus[split] = shared_corpus
+ split_dataset = dataset[split]
+ queries[split] = split_dataset.map(
+ lambda x, idx: {
+ "id": f"query-{split}-{idx}",
+ "text": x["text_corrected"],
+ "modality": "text",
+ "image": None,
+ },
+ with_indices=True,
+ remove_columns=[
+ "split",
+ "text_ocr",
+ "humor",
+ "sarcasm",
+ "offensive",
+ "motivational",
+ "sentiment",
+ "text_corrected",
+ ],
+ )
+ if split == "test":
+ queries[split] = queries[split].select(
+ [i for i in range(len(queries[split])) if i not in [489, 492, 494]]
+ )
+ relevant_docs[split] = {}
+ for index in range(len(split_dataset)):
+ if index not in [489, 492, 494]:
+ query_id = f"query-{split}-{index}"
+ doc_id = f"corpus-{split}-{index}"
+ if query_id not in relevant_docs[split]:
+ relevant_docs[split][query_id] = {}
+ relevant_docs[split][query_id][doc_id] = 1
+ return corpus, queries, relevant_docs
+
+
+class MemotionT2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="MemotionT2IRetrieval",
+ description="Retrieve memes based on captions.",
+ reference="https://aclanthology.org/2020.semeval-1.99/",
+ dataset={
+ "path": "Ahren09/MMSoc_Memotion",
+ "revision": "cdb15b61d84d56db73e0e59535dfea81ea3c22f4",
+ # "trust_remote_code": True,
+ },
+ type="Any2AnyRetrieval",
+ category="t2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2020-01-01", "2020-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="found",
+ bibtex_citation="""@inproceedings{sharma2020semeval,
+ title={SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!},
+ author={Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn},
+ booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation},
+ pages={759--773},
+ year={2020}
+}""",
+ descriptive_stats={
+ "n_samples": None,
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 1.0,
+ "average_query_length": 83.80057388809182,
+ "num_documents": 6988,
+ "num_queries": 697,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
+
+ def load_data(self, **kwargs):
+ if self.data_loaded:
+ return
+ self.corpus, self.queries, self.relevant_docs = _load_data(
+ path=self.metadata_dict["dataset"]["path"],
+ splits=self.metadata_dict["eval_splits"],
+ cache_dir=kwargs.get("cache_dir", None),
+ revision=self.metadata_dict["dataset"]["revision"],
+ )
+
+ self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py
new file mode 100644
index 0000000000..aa05ac6494
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class NIGHTSI2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="NIGHTSI2IRetrieval",
+ description="Retrieve an identical image to the given image.",
+ reference="https://proceedings.neurips.cc/paper_files/paper/2023/hash/9f09f316a3eaf59d9ced5ffaefe97e0f-Abstract-Conference.html",
+ dataset={
+ "path": "MRBench/mbeir_nights_task4",
+ "revision": "c9583e052be7ad52d870c62a207a2e887ba9b8aa",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Duplicate Image Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@article{fu2024dreamsim,
+ title={DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data},
+ author={Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip},
+ journal={Advances in Neural Information Processing Systems},
+ volume={36},
+ year={2024}
+}""",
+ prompt={
+ "query": "Find a day-to-day image that looks similar to the provided image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 2120},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 40038,
+ "num_queries": 2120,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py
new file mode 100644
index 0000000000..65b1c3b202
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class OKVQAIT2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="OKVQAIT2TRetrieval",
+ description="Retrieve a Wiki passage to answer a query about an image.",
+ reference="https://okvqa.allenai.org",
+ dataset={
+ "path": "izhx/UMRB-OKVQA",
+ "revision": "96a84a043f5465893670cf616f90e64086c0417a",
+ },
+ type="Any2AnyRetrieval",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_10",
+ date=("2019-01-01", "2020-07-29"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{marino2019ok,
+ title={Ok-vqa: A visual question answering benchmark requiring external knowledge},
+ author={Marino, Kenneth and Rastegari, Mohammad and Farhadi, Ali and Mottaghi, Roozbeh},
+ booktitle={Proceedings of the IEEE/cvf conference on computer vision and pattern recognition},
+ pages={3195--3204},
+ year={2019}
+}""",
+ prompt={
+ "query": "Retrieve documents that provide an answer to the question alongside the image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 5046},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 41.7072929052715,
+ "average_query_length": 631.7119703796849,
+ "num_documents": 114516,
+ "num_queries": 5046,
+ "average_relevant_docs_per_query": 7.426674593737614,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py
new file mode 100644
index 0000000000..c6d1ef6baa
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class OVENIT2ITRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="OVENIT2ITRetrieval",
+ description="Retrieve a Wiki image and passage to answer a query about an image.",
+ reference="https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_oven_task8",
+ "revision": "350d14b7258189654e26a2be93dc0bd6bee09b76",
+ },
+ type="Any2AnyRetrieval",
+ category="it2it",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{hu2023open,
+ title={Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities},
+ author={Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={12065--12075},
+ year={2023}
+}""",
+ prompt={
+ "query": "Retrieve a Wikipedia image-description pair that provides evidence for the question of this image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 14741},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 335135,
+ "num_queries": 14741,
+ "average_relevant_docs_per_query": 17.7,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py
new file mode 100644
index 0000000000..94898f4819
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class OVENIT2TRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="OVENIT2TRetrieval",
+ description="Retrieve a Wiki passage to answer a query about an image.",
+ reference="https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html",
+ dataset={
+ "path": "MRBench/mbeir_oven_task6",
+ "revision": "2192074af29422bc1dc41cf07936f198b8c69bd0",
+ },
+ type="Any2AnyRetrieval",
+ category="it2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="ndcg_at_10",
+ date=("2023-01-01", "2023-12-31"),
+ domains=["Encyclopaedic"],
+ task_subtypes=["Image Text Retrieval"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{hu2023open,
+ title={Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities},
+ author={Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={12065--12075},
+ year={2023}
+}""",
+ prompt={
+ "query": "Retrieve a Wikipedia paragraph that provides an answer to the given query about the image."
+ },
+ descriptive_stats={
+ "n_samples": {"test": 50004},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 676667,
+ "num_queries": 50004,
+ "average_relevant_docs_per_query": 9.9,
+ }
+ },
+ },
+ )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py
new file mode 100644
index 0000000000..dbec8e6ae7
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="ROxfordEasyI2IRetrieval",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-easy",
+ "revision": "b71b5f67a93aa63761b79a67bcf28bd2ae590902",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+ title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 516,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 43.3,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="ROxfordMediumI2IRetrieval",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-medium",
+ "revision": "1dfb86730ee4b3f49b441f4896d473c83eb5ff0d",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+ title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 788,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 78.9,
+ }
+ },
+ },
+ )
+ skip_first_result = False
+
+
+class ROxfordHardI2IRetrieval(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ name="ROxfordHardI2IRetrieval",
+ description="Retrieve photos of landmarks in Oxford, UK.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html",
+ dataset={
+ "path": "JamieSJS/r-oxford-hard",
+ "revision": "f71ab9d4aabcda93d55a7e65edfb3a34767d89e6",
+ },
+ type="Any2AnyRetrieval",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cv_recall_at_1",
+ date=("2009-01-01", "2010-04-01"),
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{radenovic2018revisiting,
+ title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
+ author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5706--5715},
+ year={2018}
+}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 70},
+ "avg_character_length": {
+ "test": {
+ "average_document_length": 0.0,
+ "average_query_length": 0.0,
+ "num_documents": 685,
+ "num_queries": 70,
+ "average_relevant_docs_per_query": 35.7,
+ }
+ },
+ },
+ )
+ skip_first_result = False
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py
new file mode 100644
index 0000000000..5d08f1ef91
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class RP2kI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval over the RP2K retail-product image collection.

    Every image serves as both a query and a document (39457 of each);
    scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="RP2kI2IRetrieval",
        description="Retrieve photos of 39457 products.",
        reference="https://arxiv.org/abs/2006.12634",
        dataset={
            "path": "JamieSJS/rp2k",
            "revision": "f8f82d4eb1aa4dc4dbf2c768596c8110a3703765",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        # NOTE(review): the RP2K paper is from 2020; this 2009-2010 range looks
        # copied from another task — confirm against the dataset card.
        date=("2009-01-01", "2010-04-01"),
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@article{peng2020rp2k,
  title={RP2K: A large-scale retail product dataset for fine-grained image classification},
  author={Peng, Jingtian and Xiao, Chang and Li, Yifan},
  journal={arXiv preprint arXiv:2006.12634},
  year={2020}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 39457},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 39457,
                    "num_queries": 39457,
                    "average_relevant_docs_per_query": 111.8,
                }
            },
        },
    )
    # Each query image is itself in the corpus, so the first (self-match)
    # result is presumably skipped — confirm with AbsTaskAny2AnyRetrieval.
    skip_first_result = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py
new file mode 100644
index 0000000000..8c2f6344fb
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval on revisited Paris landmarks (Easy protocol).

    70 query photos against a 1470-image corpus; scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="RParisEasyI2IRetrieval",
        description="Retrieve photos of landmarks in Paris, France.",
        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html",
        dataset={
            "path": "JamieSJS/r-paris-easy",
            "revision": "7d821ddebcb30ad343133e3a81e23347ac2a08a8",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        # NOTE(review): predates the 2018 paper — presumably the photo
        # collection period; confirm.
        date=("2009-01-01", "2010-04-01"),
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{radenovic2018revisiting,
  title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
  author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5706--5715},
  year={2018}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 70},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 1470,
                    "num_queries": 70,
                    "average_relevant_docs_per_query": 98.2,
                }
            },
        },
    )
    # Queries are separate from the corpus, so the top-ranked hit is scored.
    skip_first_result = False
+
+
class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval on revisited Paris landmarks (Medium protocol).

    70 query photos against a 2651-image corpus; scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="RParisMediumI2IRetrieval",
        description="Retrieve photos of landmarks in Paris, France.",
        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html",
        dataset={
            "path": "JamieSJS/r-paris-medium",
            "revision": "3d959815e102785efd628170281f1e65561b03d2",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        # NOTE(review): predates the 2018 paper — presumably the photo
        # collection period; confirm.
        date=("2009-01-01", "2010-04-01"),
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{radenovic2018revisiting,
  title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
  author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5706--5715},
  year={2018}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 70},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 2651,
                    "num_queries": 70,
                    "average_relevant_docs_per_query": 147.9,
                }
            },
        },
    )
    # Queries are separate from the corpus, so the top-ranked hit is scored.
    skip_first_result = False
+
+
class RParisHardI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval on revisited Paris landmarks (Hard protocol).

    70 query photos against a 6322-image corpus; scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="RParisHardI2IRetrieval",
        description="Retrieve photos of landmarks in Paris, France.",
        reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html",
        dataset={
            "path": "JamieSJS/r-paris-hard",
            "revision": "d3e0adf4e942446c04427511ccce281c86861248",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        # NOTE(review): predates the 2018 paper — presumably the photo
        # collection period; confirm.
        date=("2009-01-01", "2010-04-01"),
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{radenovic2018revisiting,
  title={Revisiting oxford and paris: Large-scale image retrieval benchmarking},
  author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5706--5715},
  year={2018}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 70},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 6322,
                    "num_queries": 70,
                    # NOTE(review): identical to ROxfordHardI2IRetrieval's value
                    # (35.7) despite a different corpus size — possibly
                    # copy-pasted; recompute and confirm.
                    "average_relevant_docs_per_query": 35.7,
                }
            },
        },
    )
    # Queries are separate from the corpus, so the top-ranked hit is scored.
    skip_first_result = False
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py
new file mode 100644
index 0000000000..648d2d2e44
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class ReMuQIT2TRetrieval(AbsTaskAny2AnyRetrieval):
    """Image+text-to-text retrieval on ReMuQ.

    Given an image plus a text query, retrieve the Wikipedia passage that
    answers the question (3609 queries over 138794 passages); the main score
    is ``cv_recall_at_5``.
    """

    metadata = TaskMetadata(
        name="ReMuQIT2TRetrieval",
        description="Retrieval a Wiki passage to answer query about an image.",
        reference="https://github.com/luomancs/ReMuQ",
        dataset={
            "path": "izhx/UMRB-ReMuQ",
            "revision": "f0bd5955d2897bd1bed56546e88082d966c90a80",
        },
        type="Any2AnyRetrieval",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_5",
        date=("2023-05-15", "2023-07-09"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc0-1.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["image", "text"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{luo-etal-2023-end,
    title = "End-to-end Knowledge Retrieval with Multi-modal Queries",
    author = "Luo, Man and
      Fang, Zhiyuan and
      Gokhale, Tejas and
      Yang, Yezhou and
      Baral, Chitta",
    editor = "Rogers, Anna and
      Boyd-Graber, Jordan and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.478",
    doi = "10.18653/v1/2023.acl-long.478",
    pages = "8573--8589",
}""",
        prompt={
            "query": "Retrieve a fact-based paragraph that provides an answer to the given query about the image."
        },
        descriptive_stats={
            "n_samples": {"test": 3609},
            "avg_character_length": {
                "test": {
                    "average_document_length": 208.18675158868538,
                    "average_query_length": 73.85508451094486,
                    "num_documents": 138794,
                    "num_queries": 3609,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py
new file mode 100644
index 0000000000..0558f0ce26
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class SOPI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval on Stanford Online Products.

    Every image is both a query and a document (120053 of each);
    scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="SOPI2IRetrieval",
        description="Retrieve product photos of 22634 online products.",
        reference="https://paperswithcode.com/dataset/stanford-online-products",
        dataset={
            "path": "JamieSJS/stanford-online-products",
            "revision": "0b3a1622902e6258425e673405bdfb1e5dfa8618",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        # NOTE(review): the cited paper is from 2016; confirm what this
        # 2019 date refers to.
        date=("2019-07-17", "2019-07-17"),
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{oh2016deep,
  title={Deep metric learning via lifted structured feature embedding},
  author={Oh Song, Hyun and Xiang, Yu and Jegelka, Stefanie and Savarese, Silvio},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4004--4012},
  year={2016}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 120053},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 120053,
                    "num_queries": 120053,
                    "average_relevant_docs_per_query": 7,
                }
            },
        },
    )
    # Each query image is itself in the corpus, so the first (self-match)
    # result is presumably skipped — confirm with AbsTaskAny2AnyRetrieval.
    skip_first_result = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py
new file mode 100644
index 0000000000..a8aac928c4
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from datasets import load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
def _load_data(
    path: str,
    splits: list[str],
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Build corpus/queries/qrels for SciMMIR image-to-text retrieval.

    Each dataset row yields one text document (the caption is kept,
    ``modality="text"``) and one image query (``modality="image"``); the
    qrels pair row *i*'s query with row *i*'s document (1:1 relevance).

    Args:
        path: Hugging Face dataset path.
        splits: Split names to load, e.g. ``["test"]``.
        cache_dir: Optional ``datasets`` cache directory.
        revision: Optional dataset revision (commit hash).

    Returns:
        Tuple ``(corpus, queries, relevant_docs)``, each keyed by split.
    """
    corpus = {}
    queries = {}
    relevant_docs = {}

    dataset = load_dataset(
        path,
        cache_dir=cache_dir,
        revision=revision,
    )

    # Metadata columns that are irrelevant for retrieval.
    drop_columns = [
        "file_name_index",
        "class",
        "super_class",
        "sub_class",
        "split",
    ]

    for split in splits:
        split_dataset = dataset[split]

        # Documents: keep the caption text, clear the image field.
        corpus[split] = split_dataset.map(
            lambda x, idx: {
                "id": f"corpus-{split}-{idx}",
                "modality": "text",
                "image": None,
            },
            with_indices=True,
            remove_columns=drop_columns,
        )

        # Queries: keep the figure/table image, clear the text field.
        queries[split] = split_dataset.map(
            lambda x, idx: {
                "id": f"query-{split}-{idx}",
                "text": None,
                "modality": "image",
            },
            with_indices=True,
            remove_columns=drop_columns,
        )

        # Row i's image query is relevant only to row i's caption document;
        # ids are unique per index, so no membership check is needed.
        relevant_docs[split] = {
            f"query-{split}-{idx}": {f"corpus-{split}-{idx}": 1}
            for idx in range(len(split_dataset))
        }
    return corpus, queries, relevant_docs
+
+
class SciMMIRI2TRetrieval(AbsTaskAny2AnyRetrieval):
    """Image-to-text retrieval on SciMMIR: retrieve the caption for a
    scientific figure or table image. Scored with nDCG@10.
    """

    metadata = TaskMetadata(
        name="SciMMIRI2TRetrieval",
        description="Retrieve captions based on figures and tables.",
        reference="https://aclanthology.org/2024.findings-acl.746/",
        dataset={
            "path": "m-a-p/SciMMIR",
            "revision": "eea276dc58c52eab33e9476acb137ff5530b78e9",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="i2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2023-01-01", "2023-12-31"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{wu2024scimmir,
  title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval},
  author={Wu, Siwei and Li, Yizhi and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others},
  journal={arXiv preprint arXiv:2401.13478},
  year={2024}
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 261.1932607759946,
                    "average_query_length": 1.0,
                    "num_documents": 16263,
                    "num_queries": 16263,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels via the module-level ``_load_data``.

        Idempotent: returns immediately if the data was already loaded.
        """
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py
new file mode 100644
index 0000000000..41fa6aebc1
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from datasets import load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
def _load_data(
    path: str,
    splits: list[str],
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Build corpus/queries/qrels for SciMMIR text-to-image retrieval.

    Each dataset row yields one image document (the figure/table is kept,
    ``modality="image"``) and one text query (the caption is kept,
    ``modality="text"``); the qrels pair row *i*'s query with row *i*'s
    document (1:1 relevance).

    Args:
        path: Hugging Face dataset path.
        splits: Split names to load, e.g. ``["test"]``.
        cache_dir: Optional ``datasets`` cache directory.
        revision: Optional dataset revision (commit hash).

    Returns:
        Tuple ``(corpus, queries, relevant_docs)``, each keyed by split.
    """
    corpus = {}
    queries = {}
    relevant_docs = {}

    dataset = load_dataset(
        path,
        cache_dir=cache_dir,
        revision=revision,
    )

    # Metadata columns that are irrelevant for retrieval.
    drop_columns = [
        "file_name_index",
        "class",
        "super_class",
        "sub_class",
        "split",
    ]

    for split in splits:
        split_dataset = dataset[split]

        # Documents: keep the figure/table image, clear the text field.
        corpus[split] = split_dataset.map(
            lambda x, idx: {
                "id": f"corpus-{split}-{idx}",
                "text": None,
                "modality": "image",
            },
            with_indices=True,
            remove_columns=drop_columns,
        )

        # Queries: keep the caption text, clear the image field.
        queries[split] = split_dataset.map(
            lambda x, idx: {
                "id": f"query-{split}-{idx}",
                "modality": "text",
                "image": None,
            },
            with_indices=True,
            remove_columns=drop_columns,
        )

        # Row i's caption query is relevant only to row i's image document;
        # ids are unique per index, so no membership check is needed.
        relevant_docs[split] = {
            f"query-{split}-{idx}": {f"corpus-{split}-{idx}": 1}
            for idx in range(len(split_dataset))
        }
    return corpus, queries, relevant_docs
+
+
class SciMMIRT2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image retrieval on SciMMIR: retrieve the scientific figure or
    table image for a caption. Scored with nDCG@10.
    """

    metadata = TaskMetadata(
        name="SciMMIRT2IRetrieval",
        description="Retrieve figures and tables based on captions.",
        reference="https://aclanthology.org/2024.findings-acl.746/",
        dataset={
            "path": "m-a-p/SciMMIR",
            "revision": "eea276dc58c52eab33e9476acb137ff5530b78e9",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2023-01-01", "2023-12-31"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{wu2024scimmir,
  title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval},
  author={Wu, Siwei and Li, Yizhi and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others},
  journal={arXiv preprint arXiv:2401.13478},
  year={2024}
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 261.1932607759946,
                    "num_documents": 16263,
                    "num_queries": 16263,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels via the module-level ``_load_data``.

        Idempotent: returns immediately if the data was already loaded.
        """
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py
new file mode 100644
index 0000000000..002e1a39e2
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class SketchyI2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Sketch-to-photo retrieval on the Sketchy dataset.

    Scored with ``cv_recall_at_1``.

    NOTE(review): the ``bibtex_citation`` below cites "The MET dataset"
    (NeurIPS 2021), not Sketchy, and the "conference dates" comment on
    ``date`` matches NeurIPS 2021 — both look copy-pasted from a MET task;
    confirm and replace with the proper Sketchy citation.
    """

    metadata = TaskMetadata(
        name="SketchyI2IRetrieval",
        description="Retrieve photos from sketches.",
        reference="https://arxiv.org/abs/2202.01747",
        dataset={
            "path": "JamieSJS/sketchy",
            "revision": "c8b8c1b7a2f0a92f1bfaaa1c9afc22aa42c61d5b",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        date=("2021-12-06", "2021-12-14"),  # conference dates
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{ypsilantis2021met,
  title={The met dataset: Instance-level recognition for artworks},
  author={Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos},
  booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
  year={2021}
}
  """,
        descriptive_stats={
            "n_samples": {"test": 452886},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 7.24,
                    # NOTE(review): many more queries than documents — confirm
                    # these two counts are not swapped.
                    "num_documents": 25000,
                    "num_queries": 452886,
                    "average_relevant_docs_per_query": 3623.0,
                }
            },
        },
    )
    # Queries (sketches) are separate from the photo corpus; keep first hit.
    skip_first_result = False
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py
new file mode 100644
index 0000000000..e8d267eeaa
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class StanfordCarsI2I(AbsTaskAny2AnyRetrieval):
    """Image-to-image retrieval on Stanford Cars (196 car makes/models).

    Every image is both a query and a document (8041 of each);
    scored with ``cv_recall_at_1``.
    """

    metadata = TaskMetadata(
        name="StanfordCarsI2IRetrieval",
        description="Retrieve car images from 196 makes.",
        reference="https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content",
        dataset={
            "path": "isaacchung/stanford_cars_retrieval",
            "revision": "b27a0612211af3598bd11fe28af20928f20cce06",
        },
        type="Any2AnyRetrieval",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="cv_recall_at_1",
        date=("2012-01-01", "2013-04-01"),
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{Krause2013CollectingAL,
  title={Collecting a Large-scale Dataset of Fine-grained Cars},
  author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei},
  year={2013},
  url={https://api.semanticscholar.org/CorpusID:16632981}
}
  """,
        descriptive_stats={
            "n_samples": {"default": 8041},
            "avg_character_length": {
                "test": {
                    "average_document_length": 1074.894348894349,
                    "average_query_length": 77.06142506142506,
                    "num_documents": 8041,
                    "num_queries": 8041,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
    # Each query image is itself in the corpus, so the first (self-match)
    # result is presumably skipped — confirm with AbsTaskAny2AnyRetrieval.
    skip_first_result = True


# Backward-compatible alias: every other task class in these modules is named
# after its metadata ``name`` (e.g. ``ROxfordHardI2IRetrieval``); the original
# class name here dropped the "Retrieval" suffix. Keep the short name working
# while exposing the conventional one.
StanfordCarsI2IRetrieval = StanfordCarsI2I
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py
new file mode 100644
index 0000000000..b85cd1f94b
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class TUBerlinT2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image retrieval on the TU-Berlin sketch dataset.

    250 text queries against 20000 sketch images; scored with nDCG@10.
    """

    metadata = TaskMetadata(
        name="TUBerlinT2IRetrieval",
        description="Retrieve sketch images based on text descriptions.",
        reference="https://dl.acm.org/doi/pdf/10.1145/2185520.2185540?casa_token=tq-eUx5UROYAAAAA:_694nPzE7tali6LCkxQc0M-mlo9xslasPMcVnFPMy9tDfvt7lg7p1RTe-k8VWCjuv9gmkQqasKUZ",
        dataset={
            "path": "gowitheflow/tu-berlin",
            "revision": "0cd78cd1ddbd3cafa9f319c638ebd77836ec9ff6",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2012-01-01", "2012-12-31"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{eitz2012humans,
  title={How do humans sketch objects?},
  author={Eitz, Mathias and Hays, James and Alexa, Marc},
  journal={ACM Transactions on graphics (TOG)},
  volume={31},
  number={4},
  pages={1--10},
  year={2012},
  publisher={Acm New York, NY, USA}
}""",
        descriptive_stats={
            "n_samples": {"test": 250},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 7.24,
                    "num_documents": 20000,
                    "num_queries": 250,
                    "average_relevant_docs_per_query": 80.0,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py
new file mode 100644
index 0000000000..39f07cf945
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class VQA2IT2TRetrieval(AbsTaskAny2AnyRetrieval):
    """Image+text-to-text retrieval on VQAv2: retrieve the correct answer
    text for a question about an image. Scored with nDCG@10.
    """

    metadata = TaskMetadata(
        name="VQA2IT2TRetrieval",
        description="Retrieve the correct answer for a question about an image.",
        reference="https://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html",
        dataset={
            "path": "JamieSJS/vqa-2",
            "revision": "69882b6ba0b443dd62e633e546725b0f13b7e3aa",
            "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2017-07-01", "2017-07-01"),
        domains=["Web"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@InProceedings{Goyal_2017_CVPR,
author = {Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi},
title = {Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {July},
year = {2017}
}
""",
        descriptive_stats={
            "n_samples": {"test": 4319},
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 2091,
                    "num_queries": 4319,
                    "average_relevant_docs_per_query": 1,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py
new file mode 100644
index 0000000000..44d0d36cb0
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py
@@ -0,0 +1,610 @@
+from __future__ import annotations
+
+from datasets import load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
def _load_data(
    path: str,
    splits: str,
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Load a ViDoRe BEIR-style dataset into corpus/queries/qrels.

    For each split, the "queries" config provides text queries, the
    "corpus" config provides page-image documents, and the "qrels" config
    provides graded relevance judgements. Ids are namespaced as
    ``query-<split>-<id>`` / ``corpus-<split>-<id>``.
    """
    corpus = {}
    queries = {}
    relevant_docs = {}

    for split in splits:
        load_kwargs = {
            "split": split,
            "cache_dir": cache_dir,
            "revision": revision,
        }

        # Text queries: namespace the id, add an empty image slot.
        raw_queries = load_dataset(path, "queries", **load_kwargs)
        queries[split] = raw_queries.map(
            lambda row: {
                "id": f"query-{split}-{row['query-id']}",
                "text": row["query"],
                "image": None,
                "modality": "text",
            },
            remove_columns=["query-id", "query"],
        )

        # Image documents: namespace the id, add an empty text slot.
        raw_corpus = load_dataset(path, "corpus", **load_kwargs)
        corpus[split] = raw_corpus.map(
            lambda row: {
                "id": f"corpus-{split}-{row['corpus-id']}",
                "text": None,
                "modality": "image",
            },
            remove_columns=["corpus-id"],
        )

        # Relevance judgements, re-keyed to the namespaced ids.
        raw_qrels = load_dataset(path, "qrels", **load_kwargs)
        split_qrels = relevant_docs.setdefault(split, {})
        for entry in raw_qrels:
            query_key = f"query-{split}-{entry['query-id']}"
            doc_key = f"corpus-{split}-{entry['corpus-id']}"
            split_qrels.setdefault(query_key, {})[doc_key] = int(entry["score"])

    return corpus, queries, relevant_docs
+
+
class VidoreArxivQARetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image page retrieval on the ViDoRe ArxivQA subset.

    Text questions are matched against page screenshots; scored with nDCG@5.
    """

    metadata = TaskMetadata(
        name="VidoreArxivQARetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/arxivqa_test_subsampled_beir",
            "revision": "7d94d570960eac2408d3baa7a33f9de4822ae3e4",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): "that relevant" is ungrammatical, but the prompt is
        # kept verbatim so previously reported scores stay reproducible.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 99.328,
                    "num_documents": 500,
                    "num_queries": 500,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels once; later calls are no-ops."""
        # Guard against redundant re-downloads/rebuilds, consistent with the
        # other Any2AnyRetrieval tasks (e.g. the SciMMIR tasks).
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreDocVQARetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image page retrieval on the ViDoRe DocVQA subset.

    Text questions are matched against page screenshots; scored with nDCG@5.
    """

    metadata = TaskMetadata(
        name="VidoreDocVQARetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/docvqa_test_subsampled_beir",
            "revision": "162ba2fc1a8437eda8b6c37b240bc1c0f0deb092",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): "that relevant" is ungrammatical, but the prompt is
        # kept verbatim so previously reported scores stay reproducible.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 41.746,
                    "num_documents": 500,
                    "num_queries": 500,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels once; later calls are no-ops."""
        # Guard against redundant re-downloads/rebuilds, consistent with the
        # other Any2AnyRetrieval tasks (e.g. the SciMMIR tasks).
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreInfoVQARetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image page retrieval on the ViDoRe InfoVQA subset.

    Text questions are matched against page screenshots; scored with nDCG@5.
    """

    metadata = TaskMetadata(
        name="VidoreInfoVQARetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/infovqa_test_subsampled_beir",
            "revision": "b802cc5fd6c605df2d673a963667d74881d2c9a4",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): "that relevant" is ungrammatical, but the prompt is
        # kept verbatim so previously reported scores stay reproducible.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 64.934,
                    "num_documents": 500,
                    "num_queries": 500,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels once; later calls are no-ops."""
        # Guard against redundant re-downloads/rebuilds, consistent with the
        # other Any2AnyRetrieval tasks (e.g. the SciMMIR tasks).
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreTabfquadRetrieval(AbsTaskAny2AnyRetrieval):
    """Text-to-image page retrieval on the ViDoRe TabFQuAD subset.

    280 text questions against 70 page screenshots; scored with nDCG@5.
    """

    metadata = TaskMetadata(
        name="VidoreTabfquadRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/tabfquad_test_subsampled_beir",
            "revision": "61a2224bcd29b7b261a4892ff4c8bea353527a31",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): "that relevant" is ungrammatical, but the prompt is
        # kept verbatim so previously reported scores stay reproducible.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 100.63214285714285,
                    "num_documents": 70,
                    "num_queries": 280,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Load corpus/queries/qrels once; later calls are no-ops."""
        # Guard against redundant re-downloads/rebuilds, consistent with the
        # other Any2AnyRetrieval tasks (e.g. the SciMMIR tasks).
        if self.data_loaded:
            return
        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreTatdqaRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe TAT-DQA subset: text-to-image retrieval of document-page
    screenshots ("t2i") — questions retrieve page images, one relevant page
    per query."""

    metadata = TaskMetadata(
        name="VidoreTatdqaRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/tatdqa_test_beir",
            "revision": "5feb5630fdff4d8d189ffedb2dba56862fdd45c0",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 72.76368009621167,
                    "num_documents": 277,
                    "num_queries": 1663,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreShiftProjectRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe Shift Project subset: text-to-image retrieval of document-page
    screenshots ("t2i") — questions retrieve page images, one relevant page
    per query."""

    metadata = TaskMetadata(
        name="VidoreShiftProjectRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/shiftproject_test_beir",
            "revision": "84a382e05c4473fed9cff2bbae95fe2379416117",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 97.7,
                    "num_documents": 1000,
                    "num_queries": 100,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreSyntheticDocQAAIRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe SyntheticDocQA (artificial intelligence) subset: text-to-image
    retrieval of document-page screenshots ("t2i"), one relevant page per
    query."""

    metadata = TaskMetadata(
        name="VidoreSyntheticDocQAAIRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/syntheticDocQA_artificial_intelligence_test_beir",
            "revision": "2d9ebea5a1c6e9ef4a3b902a612f605dca11261c",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 77.71,
                    "num_documents": 968,
                    "num_queries": 100,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreSyntheticDocQAEnergyRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe SyntheticDocQA (energy) subset: text-to-image retrieval of
    document-page screenshots ("t2i"), one relevant page per query."""

    metadata = TaskMetadata(
        name="VidoreSyntheticDocQAEnergyRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/syntheticDocQA_energy_test_beir",
            "revision": "9935aadbad5c8deec30910489db1b2c7133ae7a7",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 83.69,
                    "num_documents": 977,
                    "num_queries": 100,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreSyntheticDocQAGovernmentReportsRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe SyntheticDocQA (government reports) subset: text-to-image
    retrieval of document-page screenshots ("t2i"), one relevant page per
    query."""

    metadata = TaskMetadata(
        name="VidoreSyntheticDocQAGovernmentReportsRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/syntheticDocQA_government_reports_test_beir",
            "revision": "b4909afa930f81282fd20601e860668073ad02aa",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 82.53,
                    "num_documents": 972,
                    "num_queries": 100,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
+
+
class VidoreSyntheticDocQAHealthcareIndustryRetrieval(AbsTaskAny2AnyRetrieval):
    """ViDoRe SyntheticDocQA (healthcare industry) subset: text-to-image
    retrieval of document-page screenshots ("t2i"), one relevant page per
    query."""

    metadata = TaskMetadata(
        name="VidoreSyntheticDocQAHealthcareIndustryRetrieval",
        description="Retrieve associated pages according to questions.",
        reference="https://arxiv.org/pdf/2407.01449",
        dataset={
            "path": "vidore/syntheticDocQA_healthcare_industry_test_beir",
            "revision": "f9e25d5b6e13e1ad9f5c3cce202565031b3ab164",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_5",
        date=("2024-01-01", "2024-07-01"),
        domains=["Academic"],
        task_subtypes=["Image Text Retrieval"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  journal={arXiv preprint arXiv:2407.01449},
  year={2024}
}""",
        # NOTE(review): prompt has a grammar slip ("that relevant"); left
        # byte-identical because changing it would alter benchmark inputs.
        prompt={"query": "Find a screenshot that relevant to the user's question."},
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "average_document_length": 1.0,
                    "average_query_length": 80.43,
                    "num_documents": 965,
                    "num_queries": 100,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels via the module-level ``_load_data``.

        Fix: early-return when data is already loaded, matching the other
        loaders in this module and avoiding a redundant dataset download.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py
new file mode 100644
index 0000000000..2f79bfe9eb
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class VisualNewsI2TRetrieval(AbsTaskAny2AnyRetrieval):
    """Visual News image-to-text retrieval ("i2t"): given a news photo,
    retrieve its entity-rich caption. Data is served from the
    MRBench/mbeir_visualnews_task3 mirror; loading is inherited from
    AbsTaskAny2AnyRetrieval (no custom ``load_data`` here)."""

    metadata = TaskMetadata(
        name="VisualNewsI2TRetrieval",
        # NOTE(review): "Retrieval" should probably read "Retrieve";
        # string left unchanged here.
        description="Retrieval entity-rich captions for news images.",
        reference="https://aclanthology.org/2021.emnlp-main.542/",
        dataset={
            "path": "MRBench/mbeir_visualnews_task3",
            "revision": "aaee58895a66e4d619168849267ed2bb40d37043",
        },
        type="Any2AnyRetrieval",
        category="i2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2020-01-01", "2020-12-31"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["image", "text"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{liu2021visual,
  title={Visual News: Benchmark and Challenges in News Image Captioning},
  author={Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente},
  booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  pages={6761--6771},
  year={2021}
}""",
        prompt={"query": "Find a caption for the news in the given photo."},
        descriptive_stats={
            "n_samples": {"test": 20000},
            # Lengths are 0.0 placeholders; counts are the meaningful fields.
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 537568,
                    "num_queries": 20000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py
new file mode 100644
index 0000000000..1c5fa7fdbe
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class VisualNewsT2IRetrieval(AbsTaskAny2AnyRetrieval):
    """Visual News text-to-image retrieval ("t2i"): given a caption, retrieve
    the matching news image. Data is served from the
    MRBench/mbeir_visualnews_task0 mirror; loading is inherited from
    AbsTaskAny2AnyRetrieval (no custom ``load_data`` here)."""

    metadata = TaskMetadata(
        name="VisualNewsT2IRetrieval",
        description="Retrieve news images with captions.",
        reference="https://aclanthology.org/2021.emnlp-main.542/",
        dataset={
            "path": "MRBench/mbeir_visualnews_task0",
            "revision": "94c519d850dba2b0058c2fc9b5da6142a59aa285",
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2020-01-01", "2020-12-31"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["image", "text"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{liu2021visual,
  title={Visual News: Benchmark and Challenges in News Image Captioning},
  author={Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente},
  booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  pages={6761--6771},
  year={2021}
}""",
        prompt={
            "query": "Identify the news-related image in line with the described event."
        },
        descriptive_stats={
            "n_samples": {"test": 19995},
            # Lengths are 0.0 placeholders; counts are the meaningful fields.
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 542246,
                    "num_queries": 19995,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py
new file mode 100644
index 0000000000..96bcac96c3
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class VizWizIT2TRetrieval(AbsTaskAny2AnyRetrieval):
    """VizWiz image+text-to-text retrieval ("it2t"): given an image and a
    question about it, retrieve the correct answer text. Loading is inherited
    from AbsTaskAny2AnyRetrieval (no custom ``load_data`` here).

    NOTE(review): unlike sibling tasks, no ``prompt`` is set — confirm this
    is intentional.
    """

    metadata = TaskMetadata(
        name="VizWizIT2TRetrieval",
        description="Retrieve the correct answer for a question about an image.",
        reference="https://openaccess.thecvf.com/content_cvpr_2018/papers/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.pdf",
        dataset={
            "path": "JamieSJS/vizwiz",
            "revision": "044af162d55f82ab603fa16ffcf7f1e4dbf300e9",
            "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2018-01-01", "2018-01-01"),
        domains=["Web"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        # NOTE(review): trailing blank lines inside this citation string are
        # preserved as found.
        bibtex_citation="""@inproceedings{gurari2018vizwiz,
  title={Vizwiz grand challenge: Answering visual questions from blind people},
  author={Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={3608--3617},
  year={2018}
}

""",
        descriptive_stats={
            "n_samples": {"test": 214354},
            # Lengths are 0.0 placeholders; counts are the meaningful fields.
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 2143540,
                    "num_queries": 214354,
                    "average_relevant_docs_per_query": 1,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py
new file mode 100644
index 0000000000..e3235c4912
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class WebQAT2ITRetrieval(AbsTaskAny2AnyRetrieval):
    """WebQA text-to-image+text retrieval ("t2it"): retrieve image+caption
    sources that answer a question (M-BEIR webqa task 2 mirror). Loading is
    inherited from AbsTaskAny2AnyRetrieval (no custom ``load_data`` here)."""

    metadata = TaskMetadata(
        name="WebQAT2ITRetrieval",
        description="Retrieve sources of information based on questions.",
        reference="https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html",
        dataset={
            "path": "MRBench/mbeir_webqa_task2",
            "revision": "53db4c9f9c93cb74926a1c9d04dea7d7acac2f21",
        },
        type="Any2AnyRetrieval",
        category="t2it",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2022-01-01", "2022-12-31"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["image", "text"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{chang2022webqa,
  title={Webqa: Multihop and multimodal qa},
  author={Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan},
  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
  pages={16495--16504},
  year={2022}
  }""",
        prompt={"query": "Find a Wikipedia image that answers this question."},
        descriptive_stats={
            "n_samples": {"test": 2511},
            # Lengths are 0.0 placeholders; counts are the meaningful fields.
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 403196,
                    "num_queries": 2511,
                    "average_relevant_docs_per_query": 1.4,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py
new file mode 100644
index 0000000000..4583e61221
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class WebQAT2TRetrieval(AbsTaskAny2AnyRetrieval):
    """WebQA text-to-text retrieval ("t2t"): retrieve text passages that
    answer a question (M-BEIR webqa task 1 mirror; text modality only).
    Loading is inherited from AbsTaskAny2AnyRetrieval."""

    metadata = TaskMetadata(
        name="WebQAT2TRetrieval",
        description="Retrieve sources of information based on questions.",
        reference="https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html",
        dataset={
            "path": "MRBench/mbeir_webqa_task1",
            "revision": "468b42a2b2e767d80d2d93f5ae5d42f135a10478",
        },
        type="Any2AnyRetrieval",
        category="t2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2022-01-01", "2022-12-31"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{chang2022webqa,
  title={Webqa: Multihop and multimodal qa},
  author={Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan},
  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
  pages={16495--16504},
  year={2022}
  }""",
        prompt={
            "query": "Retrieve passages from Wikipedia that provide answers to the following question."
        },
        descriptive_stats={
            "n_samples": {"test": 2455},
            # Lengths are 0.0 placeholders; counts are the meaningful fields.
            "avg_character_length": {
                "test": {
                    "average_document_length": 0.0,
                    "average_query_length": 0.0,
                    "num_documents": 544457,
                    "num_queries": 2455,
                    "average_relevant_docs_per_query": 2.0,
                }
            },
        },
    )
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/__init__.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py
new file mode 100644
index 0000000000..5f20e45d25
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
# Subsets of the WIT dataset evaluated by WITT2IRetrieval: maps the dataset's
# two-letter language code to mteb's "iso639_3-Script" language tags.
_LANGUAGES = {
    "ar": ["ara-Arab"],
    "bg": ["bul-Cyrl"],
    "da": ["dan-Latn"],
    "el": ["ell-Grek"],
    "et": ["est-Latn"],
    "id": ["ind-Latn"],
    "ko": ["kor-Hang"],
    "ja": ["jpn-Jpan"],
    "tr": ["tur-Latn"],
    "vi": ["vie-Latn"],
    "en": ["eng-Latn"],
}
+
+
def _load_wit_data(
    path: str,
    langs: list,
    splits: list[str],  # annotation corrected: receives eval_splits (a list)
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Load WIT data and reshape it into MTEB's Any2Any retrieval layout.

    Returns ``(corpus, queries, relevant_docs)``, each keyed by language and
    then split. Corpus entries are image documents; every caption of an image
    becomes a separate text query whose qrels point back to that image.

    NOTE(review): only the "test" split is populated regardless of ``splits``
    (``splits`` is used solely to pre-build the per-split dicts), and the
    language code is passed as ``split=`` to ``load_dataset`` — presumably the
    HF dataset stores languages as splits; confirm against the dataset card.
    """
    corpus = {lang: {split: None for split in splits} for lang in langs}
    queries = {lang: {split: None for split in splits} for lang in langs}
    relevant_docs = {lang: {split: None for split in splits} for lang in langs}

    split = "test"

    for lang in langs:
        lang_data = load_dataset(
            path,
            split=lang,
            cache_dir=cache_dir,
            revision=revision,
            # trust_remote_code=True,
        )
        # Image documents: one corpus row per image, id prefixed "corpus-".
        lang_corpus = lang_data.map(
            lambda x: {
                "id": "corpus-" + x["image_id"],
                "text": None,
                "modality": "image",
                "image": x["image"],
            },
            remove_columns=[
                "captions",
                "image_id",
            ],
        )

        corpus[lang][split] = lang_corpus

        # Drop the heavy image column before iterating rows for queries/qrels.
        lang_data = lang_data.remove_columns(["image"])

        queries[lang][split] = []
        relevant_docs[lang][split] = {}

        for row in lang_data:
            image_id = "corpus-" + row["image_id"]
            # One text query per caption; each maps to its source image.
            for idx, caption in enumerate(row["captions"]):
                query_id = f"query-{row['image_id']}-{idx}"
                queries[lang][split].append(
                    {
                        "id": query_id,
                        "text": caption,
                        "modality": "text",
                        "image": None,
                    }
                )
                if query_id not in relevant_docs[lang][split]:
                    relevant_docs[lang][split][query_id] = {}
                relevant_docs[lang][split][query_id][image_id] = 1

        # Convert the accumulated query dicts into a columnar Dataset.
        queries[lang][split] = Dataset.from_dict(
            {
                "id": [query["id"] for query in queries[lang][split]],
                "text": [query["text"] for query in queries[lang][split]],
                "modality": [query["modality"] for query in queries[lang][split]],
                "image": [None for _ in queries[lang][split]],
            }
        )
    corpus = DatasetDict({lang: DatasetDict(splits) for lang, splits in corpus.items()})
    queries = DatasetDict(
        {lang: DatasetDict(splits) for lang, splits in queries.items()}
    )
    relevant_docs = DatasetDict(relevant_docs)

    return corpus, queries, relevant_docs
+
+
class WITT2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval):
    """WIT text-to-image retrieval ("t2i") over 11 languages: multilingual
    captions retrieve Wikipedia images. Data is loaded per language via
    ``_load_wit_data``."""

    metadata = TaskMetadata(
        name="WITT2IRetrieval",
        description="Retrieve images based on multilingual descriptions.",
        reference="https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf",
        dataset={
            "path": "mteb/wit",
            "revision": "91ac153f1371a98b209ed763205e25e115ecd06e",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_10",
        date=("2022-01-01", "2022-12-31"),
        domains=["Encyclopaedic", "Written"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@inproceedings{bugliarello2022iglue,
  title={IGLUE: A benchmark for transfer learning across modalities, tasks, and languages},
  author={Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan},
  booktitle={International Conference on Machine Learning},
  pages={2370--2392},
  year={2022},
  organization={PMLR}
}""",
        # Per-language stats; lengths are 0.0 placeholders, counts are the
        # meaningful fields.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "ar": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 792,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.89,
                    },
                    "bg": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 806,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.91,
                    },
                    "da": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 814,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.91,
                    },
                    "el": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 541,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.61,
                    },
                    "et": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 780,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.88,
                    },
                    "id": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 854,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.96,
                    },
                    "ja": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 842,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.95,
                    },
                    "ko": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 889,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "tr": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 681,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.77,
                    },
                    "vi": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 869,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.98,
                    },
                    "en": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 685,
                        "num_queries": 890,
                        "average_relevant_docs_per_query": 0.77,
                    },
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels for each selected language subset.

        Idempotent: returns immediately if data was already loaded.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_wit_data(
            path=self.metadata_dict["dataset"]["path"],
            langs=self.hf_subsets,
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py
new file mode 100644
index 0000000000..98b45006a2
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+from datasets import DatasetDict, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
# Subsets of xFlickrCO evaluated by XFlickr30kCoT2IRetrieval: maps the
# dataset's two-letter language code to mteb's "iso639_3-Script" tags.
_LANGUAGES = {
    "de": ["deu-Latn"],
    "en": ["eng-Latn"],
    "es": ["spa-Latn"],
    "id": ["ind-Latn"],
    "ja": ["jpn-Jpan"],
    "ru": ["rus-Cyrl"],
    "tr": ["tur-Latn"],
    "zh": ["zho-Hans"],
}
+
+
def _load_xflickrco_data(
    path: str,
    langs: list,
    splits: list[str],  # annotation corrected: receives eval_splits (a list)
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Load xFlickrCO data and reshape it into MTEB's Any2Any retrieval layout.

    Returns ``(corpus, queries, relevant_docs)``, each keyed by language and
    then split. Every example yields one image document ("corpus-<id>") and
    one text query ("query-<id>"), linked 1:1 in the qrels.

    NOTE(review): only the "test" split is populated regardless of ``splits``,
    and each language is selected by indexing the loaded dataset with the
    language code — presumably languages are stored as splits upstream;
    confirm against the dataset card. Also note ``load_dataset`` is invoked
    inside the loop, relying on the HF cache for repeated calls.
    """
    corpus = {lang: {split: None for split in splits} for lang in langs}
    queries = {lang: {split: None for split in splits} for lang in langs}
    relevant_docs = {lang: {split: None for split in splits} for lang in langs}

    split = "test"

    for lang in langs:
        lang_data = load_dataset(
            path,
            cache_dir=cache_dir,
            revision=revision,
            # trust_remote_code=True,
        )[lang]
        # Image documents: raw bytes from the "image" column.
        lang_corpus = lang_data.map(
            lambda x: {
                "id": "corpus-" + x["id"],
                "text": None,
                "modality": "image",
                "image": x["image"]["bytes"],
            },
            remove_columns=["sentences"],
        )

        # Text queries built from the "sentences" column.
        lang_queries = lang_data.map(
            lambda x: {
                "id": "query-" + x["id"],
                "text": x["sentences"],
                "modality": "text",
                "image": None,
            },
            remove_columns=["sentences"],
        )

        # 1:1 qrels: each query's only relevant document is its own image.
        relevant_docs[lang][split] = {}
        for row in lang_data:
            query_id = "query-" + row["id"]
            corpus_id = "corpus-" + row["id"]
            score = 1
            if query_id not in relevant_docs[lang][split]:
                relevant_docs[lang][split][query_id] = {}
            relevant_docs[lang][split][query_id][corpus_id] = score

        corpus[lang][split] = lang_corpus
        queries[lang][split] = lang_queries

    corpus = DatasetDict({lang: DatasetDict(splits) for lang, splits in corpus.items()})
    queries = DatasetDict(
        {lang: DatasetDict(splits) for lang, splits in queries.items()}
    )
    relevant_docs = DatasetDict(relevant_docs)
    return corpus, queries, relevant_docs
+
+
class XFlickr30kCoT2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval):
    """xFlickrCO text-to-image retrieval ("t2i") over 8 languages:
    multilingual captions retrieve Flickr30k/COCO images. Data is loaded per
    language via ``_load_xflickrco_data``."""

    metadata = TaskMetadata(
        name="XFlickr30kCoT2IRetrieval",
        description="Retrieve images based on multilingual descriptions.",
        reference="https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf",
        dataset={
            "path": "floschne/xflickrco",
            "revision": "0af2c2eba58b27a71898787e286be04befdd7a20",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_10",
        date=("2022-01-01", "2022-12-31"),
        domains=["Encyclopaedic", "Written"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@inproceedings{bugliarello2022iglue,
  title={IGLUE: A benchmark for transfer learning across modalities, tasks, and languages},
  author={Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan},
  booktitle={International Conference on Machine Learning},
  pages={2370--2392},
  year={2022},
  organization={PMLR}
}""",
        # Per-language stats; lengths are 0.0 placeholders, counts are the
        # meaningful fields (each language: 2000 queries ↔ 2000 images, 1:1).
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "test": {
                    "de": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "en": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "es": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "id": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "ja": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "ru": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "tr": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                    "zh": {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 2000,
                        "num_queries": 2000,
                        "average_relevant_docs_per_query": 1.0,
                    },
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/qrels for each selected language subset.

        Idempotent: returns immediately if data was already loaded.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_xflickrco_data(
            path=self.metadata_dict["dataset"]["path"],
            langs=self.hf_subsets,
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py
new file mode 100644
index 0000000000..a65d37f324
--- /dev/null
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py
@@ -0,0 +1,418 @@
+from __future__ import annotations
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
# Maps the dataset's language subset codes (mostly ISO 639-1; "fil" and "quz"
# are ISO 639-3) to mteb's "iso639_3-Script" language identifiers.
_LANGUAGES = {
    "ar": ["ara-Arab"],
    "bn": ["ben-Beng"],
    "cs": ["ces-Latn"],
    "da": ["dan-Latn"],
    "de": ["deu-Latn"],
    "el": ["ell-Grek"],
    "en": ["eng-Latn"],
    "es": ["spa-Latn"],
    "fa": ["fas-Arab"],
    "fi": ["fin-Latn"],
    "fil": ["fil-Latn"],
    "fr": ["fra-Latn"],
    "he": ["heb-Hebr"],
    "hi": ["hin-Deva"],
    "hr": ["hrv-Latn"],
    "hu": ["hun-Latn"],
    "id": ["ind-Latn"],
    "it": ["ita-Latn"],
    "ja": ["jpn-Jpan"],
    "ko": ["kor-Hang"],
    "mi": ["mri-Latn"],
    "nl": ["nld-Latn"],
    "no": ["nor-Latn"],
    "pl": ["pol-Latn"],
    "pt": ["por-Latn"],
    "quz": ["quz-Latn"],
    "ro": ["ron-Latn"],
    "ru": ["rus-Cyrl"],
    "sv": ["swe-Latn"],
    "sw": ["swa-Latn"],
    "te": ["tel-Telu"],
    "th": ["tha-Thai"],
    "tr": ["tur-Latn"],
    "uk": ["ukr-Cyrl"],
    "vi": ["vie-Latn"],
    "zh": ["zho-Hans"],
}
+
+
def _load_xm3600_data(
    path: str,
    langs: list,
    splits: list,
    cache_dir: str | None = None,
    revision: str | None = None,
):
    """Load corpus, queries and qrels for XM3600 text->image retrieval.

    Args:
        path: HF dataset repository id.
        langs: language subset codes (keys of ``_LANGUAGES``).
        splits: eval split names (was mis-annotated ``str``); only "test"
            is ever populated below, since the HF repo ships one evaluation
            set per language.
        cache_dir: optional HF datasets cache directory.
        revision: dataset revision to pin.

    Returns:
        Tuple of ``(corpus, queries, relevant_docs)``, each keyed by
        language and then by split; corpus/queries are ``DatasetDict``s.
    """
    corpus = {lang: {split: None for split in splits} for lang in langs}
    queries = {lang: {split: None for split in splits} for lang in langs}
    relevant_docs = {lang: {split: None for split in splits} for lang in langs}

    split = "test"

    for lang in langs:
        # NOTE: the HF repo stores each language as its own *split* name.
        lang_data = load_dataset(
            path,
            split=lang,
            cache_dir=cache_dir,
            revision=revision,
            # trust_remote_code=True,
        )

        # Corpus: one image document per row; text fields are dropped.
        corpus[lang][split] = lang_data.map(
            lambda x: {
                "id": "corpus-" + x["image_id"],
                "text": None,
                "modality": "image",
                "image": x["image"]["bytes"],
            },
            remove_columns=[
                "captions",
                "captions_tokenized",
                "captions_tokenized_lowercase",
                "image_locale",
                "image_id",
            ],
        )

        lang_data = lang_data.remove_columns(["image"])

        # Queries: one text query per caption. Build the columns directly
        # instead of a list of dicts that is re-scanned per column.
        query_ids: list = []
        query_texts: list = []
        qrels: dict = {}
        for row in lang_data:
            doc_id = "corpus-" + row["image_id"]
            for idx, caption in enumerate(row["captions"]):
                query_id = f"query-{row['image_id']}-{idx}"
                query_ids.append(query_id)
                query_texts.append(caption)
                # Every caption of an image is relevant to that image.
                qrels.setdefault(query_id, {})[doc_id] = 1

        queries[lang][split] = Dataset.from_dict(
            {
                "id": query_ids,
                "text": query_texts,
                "modality": ["text"] * len(query_ids),
                "image": [None] * len(query_ids),
            }
        )
        relevant_docs[lang][split] = qrels

    corpus = DatasetDict({lang: DatasetDict(d) for lang, d in corpus.items()})
    queries = DatasetDict({lang: DatasetDict(d) for lang, d in queries.items()})
    relevant_docs = DatasetDict(relevant_docs)

    return corpus, queries, relevant_docs
+
+
class XM3600T2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval):
    """Crossmodal-3600 text->image retrieval across 36 language subsets."""

    metadata = TaskMetadata(
        name="XM3600T2IRetrieval",
        description="Retrieve images based on multilingual descriptions.",
        reference="https://aclanthology.org/2022.emnlp-main.45/",
        dataset={
            "path": "floschne/xm3600",
            "revision": "8d3e5665526c55a5855cd6ddfbaba2032bc7cee4",
            # "trust_remote_code": True,
        },
        type="Any2AnyRetrieval",
        category="t2i",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_10",
        date=("2022-01-01", "2022-12-31"),
        domains=["Encyclopaedic", "Written"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@inproceedings{thapliyal2022crossmodal,
    title={Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset},
    author={Thapliyal, Ashish V and Tuset, Jordi Pont and Chen, Xi and Soricut, Radu},
    booktitle={Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
    pages={715--729},
    year={2022}
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                # Every subset shares the same counts, so the table is built
                # programmatically instead of ~280 hand-written identical lines.
                # NOTE(review): the original hand-written table omitted "no"
                # (Norwegian); that omission is preserved here — confirm
                # whether the subset should have stats too.
                "test": {
                    lang: {
                        "average_document_length": 0.0,
                        "average_query_length": 0.0,
                        "num_documents": 7200,
                        "num_queries": 3600,
                        "average_relevant_docs_per_query": 2.0,
                    }
                    for lang in _LANGUAGES
                    if lang != "no"
                }
            },
        },
    )

    def load_data(self, **kwargs):
        """Populate corpus/queries/relevant_docs once; no-op on later calls."""
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_xm3600_data(
            path=self.metadata_dict["dataset"]["path"],
            langs=self.hf_subsets,
            splits=self.metadata_dict["eval_splits"],
            cache_dir=kwargs.get("cache_dir", None),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/__init__.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/Any2TextMultipleChoice/__init__.py b/mteb/tasks/Image/Any2TextMultipleChoice/__init__.py
new file mode 100644
index 0000000000..e1433ec949
--- /dev/null
+++ b/mteb/tasks/Image/Any2TextMultipleChoice/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from .eng.CVBench import *
diff --git a/mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py b/mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py
new file mode 100644
index 0000000000..3f387fdcbf
--- /dev/null
+++ b/mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+import datasets
+
+from mteb.abstasks.Image.AbsTaskAny2TextMultipleChoice import (
+ AbsTaskAny2TextMultipleChoice,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
def transform_choices(example):
    """Replace a letter answer such as "(B)" with its 0-based choice index.

    Mutates and returns *example* (the shape ``datasets.map`` expects).
    """
    letter_to_index = {f"({letter})": pos for pos, letter in enumerate("ABCDEF")}
    example["answer"] = letter_to_index[example["answer"]]
    return example
+
+
class CVBenchCount(AbsTaskAny2TextMultipleChoice):
    """CV-Bench "Count" subset: multiple-choice object-counting questions over images."""

    metadata = TaskMetadata(
        name="CVBenchCount",
        description="count the number of objects in the image.",
        reference="https://arxiv.org/pdf/2406.16860",
        dataset={
            "path": "nyu-visionx/CV-Bench",
            "revision": "22409a927ab5cf68e3655023d51694587455fc99",
        },
        # NOTE(review): "Mutiple" is a typo, but this string must stay in sync
        # with the task-type registry/AbsTask name — confirm before renaming.
        type="Any2TextMutipleChoice",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2024-01-01", "2024-06-24"),  # up to the Cambrian-1 arXiv release
        domains=["Academic"],
        task_subtypes=["Question answering"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{tong2024cambrian,
    title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms},
    author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
    journal={arXiv preprint arXiv:2406.16860},
    year={2024}
}""",
        descriptive_stats={
            "n_samples": {"test": 788},
            "avg_character_length": {
                "test": {
                    # to do
                }
            },
        },
    )

    def load_data(self, **kwargs):
        # Load the full CV-Bench split, keep only "Count" rows, then convert
        # letter answers ("(A)"…) to 0-based indices and drop metadata columns
        # the evaluator does not use.
        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
        self.dataset_transform()
        self.dataset = self.dataset.filter(lambda example: example["task"] == "Count")
        self.dataset = self.dataset.map(
            transform_choices,
            remove_columns=[
                "idx",
                "type",
                "filename",
                "source",
                "source_dataset",
                "source_filename",
                "target_class",
                "target_size",
                "bbox",
                "prompt",
            ],
        )
        self.data_loaded = True
+
+
class CVBenchRelation(AbsTaskAny2TextMultipleChoice):
    """CV-Bench "Relation" subset: multiple-choice spatial-relation questions over images."""

    metadata = TaskMetadata(
        name="CVBenchRelation",
        description="decide the relation of the objects in the image.",
        reference="https://arxiv.org/pdf/2406.16860",
        dataset={
            "path": "nyu-visionx/CV-Bench",
            "revision": "22409a927ab5cf68e3655023d51694587455fc99",
        },
        # NOTE(review): "Mutiple" is a typo, but this string must stay in sync
        # with the task-type registry/AbsTask name — confirm before renaming.
        type="Any2TextMutipleChoice",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2024-01-01", "2024-06-24"),  # up to the Cambrian-1 arXiv release
        domains=["Academic"],
        task_subtypes=["Question answering"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{tong2024cambrian,
    title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms},
    author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
    journal={arXiv preprint arXiv:2406.16860},
    year={2024}
}""",
        descriptive_stats={
            "n_samples": {"test": 650},
            "avg_character_length": {
                "test": {
                    # to do
                }
            },
        },
    )

    def load_data(self, **kwargs):
        # Load the full CV-Bench split, keep only "Relation" rows, then convert
        # letter answers to 0-based indices and drop unused metadata columns.
        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
        self.dataset_transform()
        self.dataset = self.dataset.filter(
            lambda example: example["task"] == "Relation"
        )
        self.dataset = self.dataset.map(
            transform_choices,
            remove_columns=[
                "idx",
                "type",
                "filename",
                "source",
                "source_dataset",
                "source_filename",
                "target_class",
                "target_size",
                "bbox",
                "prompt",
            ],
        )
        self.data_loaded = True
+
+
class CVBenchDepth(AbsTaskAny2TextMultipleChoice):
    """CV-Bench "Depth" subset: multiple-choice relative-depth questions over images."""

    metadata = TaskMetadata(
        name="CVBenchDepth",
        description="judge the depth of the objects in the image with similarity matching.",
        reference="https://arxiv.org/pdf/2406.16860",
        dataset={
            "path": "nyu-visionx/CV-Bench",
            "revision": "22409a927ab5cf68e3655023d51694587455fc99",
        },
        # NOTE(review): "Mutiple" is a typo, but this string must stay in sync
        # with the task-type registry/AbsTask name — confirm before renaming.
        type="Any2TextMutipleChoice",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2024-01-01", "2024-06-24"),  # up to the Cambrian-1 arXiv release
        domains=["Academic"],
        task_subtypes=["Question answering"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{tong2024cambrian,
    title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms},
    author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
    journal={arXiv preprint arXiv:2406.16860},
    year={2024}
}""",
        descriptive_stats={
            "n_samples": {"test": 600},
            "avg_character_length": {
                "test": {
                    # to do
                }
            },
        },
    )

    def load_data(self, **kwargs):
        # Load the full CV-Bench split, keep only "Depth" rows, then convert
        # letter answers to 0-based indices and drop unused metadata columns.
        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
        self.dataset_transform()
        self.dataset = self.dataset.filter(lambda example: example["task"] == "Depth")
        self.dataset = self.dataset.map(
            transform_choices,
            remove_columns=[
                "idx",
                "type",
                "filename",
                "source",
                "source_dataset",
                "source_filename",
                "target_class",
                "target_size",
                "bbox",
                "prompt",
            ],
        )
        self.data_loaded = True
+
+
class CVBenchDistance(AbsTaskAny2TextMultipleChoice):
    """CV-Bench "Distance" subset: multiple-choice relative-distance questions over images."""

    metadata = TaskMetadata(
        name="CVBenchDistance",
        description="judge the distance of the objects in the image with similarity matching.",
        reference="https://arxiv.org/pdf/2406.16860",
        dataset={
            "path": "nyu-visionx/CV-Bench",
            "revision": "22409a927ab5cf68e3655023d51694587455fc99",
        },
        # NOTE(review): "Mutiple" is a typo, but this string must stay in sync
        # with the task-type registry/AbsTask name — confirm before renaming.
        type="Any2TextMutipleChoice",
        category="it2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2024-01-01", "2024-06-24"),  # up to the Cambrian-1 arXiv release
        domains=["Academic"],
        task_subtypes=["Question answering"],
        license="mit",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{tong2024cambrian,
    title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms},
    author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
    journal={arXiv preprint arXiv:2406.16860},
    year={2024}
}""",
        descriptive_stats={
            "n_samples": {"test": 600},
            "avg_character_length": {
                "test": {
                    # to do
                }
            },
        },
    )

    def load_data(self, **kwargs):
        # Load the full CV-Bench split, keep only "Distance" rows, then convert
        # letter answers to 0-based indices and drop unused metadata columns.
        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
        self.dataset_transform()
        self.dataset = self.dataset.filter(
            lambda example: example["task"] == "Distance"
        )
        self.dataset = self.dataset.map(
            transform_choices,
            remove_columns=[
                "idx",
                "type",
                "filename",
                "source",
                "source_dataset",
                "source_filename",
                "target_class",
                "target_size",
                "bbox",
                "prompt",
            ],
        )
        self.data_loaded = True
diff --git a/mteb/tasks/Image/Any2TextMultipleChoice/eng/__init__.py b/mteb/tasks/Image/Any2TextMultipleChoice/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/Clustering/__init__.py
new file mode 100644
index 0000000000..804870ebeb
--- /dev/null
+++ b/mteb/tasks/Image/Clustering/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from .eng.CIFAR import *
+from .eng.ImageNet import *
+from .eng.TinyImageNet import *
diff --git a/mteb/tasks/Image/Clustering/eng/CIFAR.py b/mteb/tasks/Image/Clustering/eng/CIFAR.py
new file mode 100644
index 0000000000..a10906d105
--- /dev/null
+++ b/mteb/tasks/Image/Clustering/eng/CIFAR.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class CIFAR10Clustering(AbsTaskImageClustering):
    """Image clustering over the 10 CIFAR-10 object categories (test split)."""

    metadata = TaskMetadata(
        name="CIFAR10Clustering",
        description="Clustering images from 10 classes.",
        reference="https://huggingface.co/datasets/uoft-cs/cifar10",
        dataset={
            "path": "uoft-cs/cifar10",
            "revision": "0b2714987fa478483af9968de7c934580d0bb9a2",
        },
        type="ImageClustering",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="nmi",
        date=(
            "2008-01-01",
            "2009-01-01",
        ),  # Estimated collection period; the CIFAR tech report is from 2009
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
    author = {Alex Krizhevsky},
    title = {Learning multiple layers of features from tiny images},
    institution = {},
    year = {2009}
}
""",
        descriptive_stats={
            "n_samples": {"test": 10000},
            # NOTE(review): images have no character length — this value looks
            # copied from a text task; confirm whether it should be removed.
            "avg_character_length": {"test": 431.4},
        },
    )

    # The HF dataset stores images under "img" (the AbsTask default differs).
    image_column_name: str = "img"
+
+
class CIFAR100Clustering(AbsTaskImageClustering):
    """Image clustering over the 100 fine-grained CIFAR-100 categories (test split)."""

    metadata = TaskMetadata(
        name="CIFAR100Clustering",
        description="Clustering images from 100 classes.",
        reference="https://huggingface.co/datasets/uoft-cs/cifar100",
        dataset={
            "path": "uoft-cs/cifar100",
            "revision": "aadb3af77e9048adbea6b47c21a81e47dd092ae5",
        },
        type="ImageClustering",
        # Fixed from "i2t": image clustering involves images only, matching
        # CIFAR10Clustering above.
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="nmi",
        date=(
            "2008-01-01",
            "2009-01-01",
        ),  # Estimated collection period; the CIFAR tech report is from 2009
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
    author = {Alex Krizhevsky},
    title = {Learning multiple layers of features from tiny images},
    institution = {},
    year = {2009}
}
""",
        descriptive_stats={
            "n_samples": {"test": 10000},
            "avg_character_length": {"test": 431.4},
        },
    )

    # The HF dataset stores images under "img"; cluster on the 100 fine labels
    # rather than the 20 coarse ones.
    image_column_name: str = "img"
    label_column_name: str = "fine_label"
diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
new file mode 100644
index 0000000000..aa0ab5720b
--- /dev/null
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class ImageNetDog15Clustering(AbsTaskImageClustering):
    """Image clustering over a 15-class dogs-only subset of ImageNet."""

    metadata = TaskMetadata(
        name="ImageNetDog15Clustering",
        description="Clustering images from a 15-class dogs-only subset of the dog classes in ImageNet.",
        reference="http://vision.stanford.edu/aditya86/ImageNetDogs/main.html",
        dataset={
            "path": "JamieSJS/imagenet-dog-15",
            "revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72",
        },
        type="ImageClustering",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="nmi",
        date=("2009-06-20", "2009-06-20"),  # Conference date
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @INPROCEEDINGS{5206848,
    author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei},
    booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
    title={ImageNet: A large-scale hierarchical image database},
    year={2009},
    volume={},
    number={},
    pages={248-255},
    keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine},
    doi={10.1109/CVPR.2009.5206848}}
""",
        descriptive_stats={
            "n_samples": {"test": 1076, "train": 1500},
            # "avg_character_length": {"test": 431.4},
        },
    )
+
+
class ImageNet10Clustering(AbsTaskImageClustering):
    """Image clustering over an easy-to-distinguish 10-class subset of ImageNet."""

    metadata = TaskMetadata(
        name="ImageNet10Clustering",
        description="Clustering images from an 10-class subset of ImageNet which are generally easy to distinguish.",
        reference="https://www.kaggle.com/datasets/liusha249/imagenet10",
        dataset={
            "path": "JamieSJS/imagenet-10",
            "revision": "88f8a6d47c257895094c5ad81e67ba751771fc99",
        },
        type="ImageClustering",
        # Fixed from "i2t": image clustering involves images only, matching
        # ImageNetDog15Clustering above.
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="nmi",
        date=("2009-06-20", "2009-06-20"),  # Conference date
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @INPROCEEDINGS{5206848,
    author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei},
    booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
    title={ImageNet: A large-scale hierarchical image database},
    year={2009},
    volume={},
    number={},
    pages={248-255},
    keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine},
    doi={10.1109/CVPR.2009.5206848}}
""",
        descriptive_stats={
            "n_samples": {"test": 13000},
            # "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/Clustering/eng/TinyImageNet.py b/mteb/tasks/Image/Clustering/eng/TinyImageNet.py
new file mode 100644
index 0000000000..d49ebbfde6
--- /dev/null
+++ b/mteb/tasks/Image/Clustering/eng/TinyImageNet.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class TinyImageNet(AbsTaskImageClustering):
    """Image clustering over the 200 Tiny ImageNet classes (valid split)."""

    metadata = TaskMetadata(
        name="TinyImageNetClustering",
        description="Clustering over 200 classes.",
        reference="https://huggingface.co/datasets/zh-plus/tiny-imagenet/viewer/default/valid",
        dataset={
            "path": "zh-plus/tiny-imagenet",
            "revision": "5a77092c28e51558c5586e9c5eb71a7e17a5e43f",
        },
        type="ImageClustering",
        category="i2i",
        eval_splits=["valid"],
        eval_langs=["eng-Latn"],
        main_score="nmi",
        date=(
            "2012-01-01",
            "2015-12-31",
        ),  # From the ImageNet source collection through the 2015 challenge report
        # Fixed copy-paste from a review-sentiment task: this is web-sourced
        # object imagery, in line with the other ImageNet tasks in this module.
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="found",
        # Replaced the `"""d"""` placeholder with the standard Tiny ImageNet citation.
        bibtex_citation="""@article{le2015tiny,
    title={Tiny imagenet visual recognition challenge},
    author={Le, Ya and Yang, Xuan},
    journal={CS 231N},
    volume={7},
    number={7},
    pages={3},
    year={2015}
}""",
        descriptive_stats={
            "n_samples": {"valid": 10000},
            "avg_character_length": {"valid": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/Clustering/eng/__init__.py b/mteb/tasks/Image/Clustering/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/ImageClassification/__init__.py b/mteb/tasks/Image/ImageClassification/__init__.py
new file mode 100644
index 0000000000..c5a82f357d
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/__init__.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from .eng.BirdsnapClassification import *
+from .eng.Caltech101Classification import *
+from .eng.CIFAR import *
+from .eng.Country211Classification import *
+from .eng.DTDClassification import *
+from .eng.EuroSATClassification import *
+from .eng.FER2013Classification import *
+from .eng.FGVCAircraftClassification import *
+from .eng.Food101Classification import *
+from .eng.GTSRBClassification import *
+from .eng.Imagenet1k import *
+from .eng.MNISTClassification import *
+from .eng.OxfordFlowersClassification import *
+from .eng.OxfordPetsClassification import *
+from .eng.PatchCamelyonClassification import *
+from .eng.RESISC45Classification import *
+from .eng.StanfordCarsClassification import *
+from .eng.STL10Classification import *
+from .eng.SUN397Classification import *
+from .eng.UCF101Classification import *
diff --git a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py
new file mode 100644
index 0000000000..2e11094b09
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class BirdsnapClassification(AbsTaskImageClassification):
    """Fine-grained classification of bird images across 500 species."""

    metadata = TaskMetadata(
        name="Birdsnap",
        description="Classifying bird images from 500 species.",
        reference="https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html",
        dataset={
            "path": "isaacchung/birdsnap",
            "revision": "fd23015508be94f0b5b59d61630e4ea2536509e4",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2013-01-01",
            "2014-01-01",
        ),  # Estimated collection period; the Birdsnap paper is from CVPR 2014
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{Berg_2014_CVPR,
    author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. and Belhumeur, Peter N.},
    title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds},
    booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year = {2014}
}
""",
        descriptive_stats={
            "n_samples": {"test": 1851},
            # NOTE(review): images have no character length — value looks
            # copied from a text task; confirm.
            "avg_character_length": {"test": 431.4},
        },
    )

    # Override default column name in the subclass: labels are the species'
    # common names.
    label_column_name: str = "common"
diff --git a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py
new file mode 100644
index 0000000000..abed2ad617
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class CIFAR10Classification(AbsTaskImageClassification):
    """Image classification over the 10 CIFAR-10 object categories (test split)."""

    metadata = TaskMetadata(
        name="CIFAR10",
        description="Classifying images from 10 classes.",
        reference="https://huggingface.co/datasets/uoft-cs/cifar10",
        dataset={
            "path": "uoft-cs/cifar10",
            "revision": "0b2714987fa478483af9968de7c934580d0bb9a2",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2008-01-01",
            "2009-01-01",
        ),  # Estimated collection period; the CIFAR tech report is from 2009
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
    author = {Alex Krizhevsky},
    title = {Learning multiple layers of features from tiny images},
    institution = {},
    year = {2009}
}
""",
        descriptive_stats={
            "n_samples": {"test": 10000},
            # NOTE(review): images have no character length — value looks
            # copied from a text task; confirm.
            "avg_character_length": {"test": 431.4},
        },
    )
    # The HF dataset stores images under "img" (the AbsTask default differs).
    image_column_name: str = "img"
+
+
class CIFAR100Classification(AbsTaskImageClassification):
    """Image classification over the 100 fine-grained CIFAR-100 categories (test split)."""

    metadata = TaskMetadata(
        name="CIFAR100",
        description="Classifying images from 100 classes.",
        reference="https://huggingface.co/datasets/uoft-cs/cifar100",
        dataset={
            "path": "uoft-cs/cifar100",
            "revision": "aadb3af77e9048adbea6b47c21a81e47dd092ae5",
        },
        type="ImageClassification",
        # Fixed from "i2t": image classification involves images only,
        # matching CIFAR10Classification above.
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2008-01-01",
            "2009-01-01",
        ),  # Estimated collection period; the CIFAR tech report is from 2009
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
    author = {Alex Krizhevsky},
    title = {Learning multiple layers of features from tiny images},
    institution = {},
    year = {2009}
}
""",
        descriptive_stats={
            "n_samples": {"test": 10000},
            "avg_character_length": {"test": 431.4},
        },
    )
    # The HF dataset stores images under "img"; classify against the 100 fine
    # labels rather than the 20 coarse ones.
    image_column_name: str = "img"
    label_column_name: str = "fine_label"
diff --git a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py
new file mode 100644
index 0000000000..30112cdf1d
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class Caltech101Classification(AbsTaskImageClassification):
    """Caltech-101: classifying images of 101 widely varied object categories.

    Uses the "with_background_category" configuration of the HuggingFace
    dataset; accuracy is reported on the test split.
    """

    metadata = TaskMetadata(
        name="Caltech101",
        description="Classifying images of 101 widely varied objects.",
        reference="https://ieeexplore.ieee.org/document/1384978",
        dataset={
            "path": "HuggingFaceM4/Caltech-101",
            "name": "with_background_category",
            "revision": "851374102055782c84f89b1b4e9d128a6568847b",
            "trust_remote_code": True,  # dataset ships a loading script
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2003-01-01",
            "2004-01-01",
        ),  # Estimated period over which the image collection was gathered
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@INPROCEEDINGS{1384978,
author={Li Fei-Fei and Fergus, R. and Perona, P.},
booktitle={2004 Conference on Computer Vision and Pattern Recognition Workshop},
title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories},
year={2004},
volume={},
number={},
pages={178-178},
keywords={Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases},
doi={10.1109/CVPR.2004.383}}
""",
        descriptive_stats={
            "n_samples": {"test": 6084},
            # NOTE(review): avg_character_length looks copy-pasted from a
            # text task; not meaningful for image-only data — confirm.
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py b/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py
new file mode 100644
index 0000000000..b73f895595
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class Country211Classification(AbsTaskImageClassification):
    """Country211: classifying geo-tagged photos into 211 country classes."""

    metadata = TaskMetadata(
        name="Country211",
        description="Classifying images of 211 countries.",
        reference="https://huggingface.co/datasets/clip-benchmark/wds_country211",
        dataset={
            "path": "clip-benchmark/wds_country211",
            "revision": "1699f138f0558342a1cbf99f7cf36b4361bb5ebc",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2020-01-01",
            "2021-02-26",
        ),  # Estimated collection period (released with CLIP, 2021)
        domains=["Scene"],
        task_subtypes=["Scene recognition"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@article{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others},
journal={arXiv preprint arXiv:2103.00020},
year={2021}
}""",
        descriptive_stats={
            "n_samples": {"test": 21100},
            "avg_character_length": {"test": 0},
        },
    )

    # WebDataset export: images are stored under "jpg", labels under "cls".
    image_column_name: str = "jpg"
    label_column_name: str = "cls"
diff --git a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
new file mode 100644
index 0000000000..eb7360f088
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class DTDClassification(AbsTaskImageClassification):
    """DTD: classifying texture images into 47 describable-texture categories."""

    metadata = TaskMetadata(
        name="DTD",
        description="Describable Textures Dataset in 47 categories.",
        reference="https://www.robots.ox.ac.uk/~vgg/data/dtd/",
        dataset={
            "path": "tanganke/dtd",
            "revision": "d2afa97d9f335b1a6b3b09c637aef667f98f966e",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2014-01-01",
            "2014-03-01",
        ),  # Estimated collection period (published at CVPR 2014)
        domains=["Encyclopaedic"],
        task_subtypes=["Textures recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{cimpoi14describing,
Author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi},
Title = {Describing Textures in the Wild},
Booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})},
Year = {2014}}
""",
        descriptive_stats={
            "n_samples": {"test": 1880},
            "avg_character_length": {"test": 456},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py
new file mode 100644
index 0000000000..5cac334c3d
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class EuroSATClassification(AbsTaskImageClassification):
    """EuroSAT: classifying Sentinel-2 satellite images by land use / land cover."""

    metadata = TaskMetadata(
        name="EuroSAT",
        description="Classifying satellite images.",
        reference="https://ieeexplore.ieee.org/document/8736785",
        dataset={
            "path": "timm/eurosat-rgb",
            "revision": "b4e28552cd5f3932b6abc37eb20d3e84901ad728",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2019-01-01",
            "2019-03-01",
        ),  # Estimated collection period (journal paper published 2019)
        domains=["Encyclopaedic"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@ARTICLE{8736785,
author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian},
journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing},
title={EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification},
year={2019},
volume={12},
number={7},
pages={2217-2226},
keywords={Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite images},
doi={10.1109/JSTARS.2019.2918242}}
""",
        descriptive_stats={
            "n_samples": {"test": 5400},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py
new file mode 100644
index 0000000000..074e92529a
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class FER2013Classification(AbsTaskImageClassification):
    """FER-2013: classifying facial-expression images into emotion categories.

    NOTE(review): the reference URL and BibTeX point to Goodfellow et al.'s
    adversarial-examples paper (arXiv:1412.6572), which merely *uses*
    FER-2013; the dataset itself was introduced in the ICML 2013
    "Challenges in Representation Learning" report (arXiv:1307.0414) —
    confirm which citation is intended.
    """

    metadata = TaskMetadata(
        name="FER2013",
        description="Classifying facial emotions.",
        reference="https://arxiv.org/abs/1412.6572",
        dataset={
            "path": "clip-benchmark/wds_fer2013",
            "revision": "9399b94167523fe5c40b3a857e24ef931ee4395b",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2014-01-01",
            "2014-12-01",
        ),  # Estimated collection period — TODO confirm (dataset dates to 2013)
        domains=["Encyclopaedic"],
        task_subtypes=["Emotion recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@misc{goodfellow2015explainingharnessingadversarialexamples,
title={Explaining and Harnessing Adversarial Examples},
author={Ian J. Goodfellow and Jonathon Shlens and Christian Szegedy},
year={2015},
eprint={1412.6572},
archivePrefix={arXiv},
primaryClass={stat.ML},
url={https://arxiv.org/abs/1412.6572},
}
""",
        descriptive_stats={
            "n_samples": {"test": 7178},
            "avg_character_length": {"test": 431.4},
        },
    )
    # WebDataset export: images are stored under "jpg", labels under "cls".
    image_column_name: str = "jpg"
    label_column_name: str = "cls"
diff --git a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py
new file mode 100644
index 0000000000..8b2a41bd50
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class FGVCAircraftClassification(AbsTaskImageClassification):
    """FGVC-Aircraft: fine-grained classification of aircraft images."""

    metadata = TaskMetadata(
        name="FGVCAircraft",
        description="Classifying aircraft images from 41 manufacturers and 102 variants.",
        reference="https://arxiv.org/abs/1306.5151",
        dataset={
            "path": "HuggingFaceM4/FGVC-Aircraft",
            "revision": "91860adfc9a09aabca5cddb5247442109b38e213",
            "trust_remote_code": True,  # dataset ships a loading script
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2009-01-01",
            "2010-01-01",
        ),  # Estimated period over which the image collection was gathered
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@misc{maji2013finegrainedvisualclassificationaircraft,
title={Fine-Grained Visual Classification of Aircraft},
author={Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi},
year={2013},
eprint={1306.5151},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/1306.5151},
}
""",
        descriptive_stats={
            "n_samples": {"test": 3333},
            "avg_character_length": {"test": 431.4},
        },
    )
    # The label could be "family", "manufacturer", or "variant";
    # "variant" has the largest number of classes (102).
    label_column_name: str = "variant"
diff --git a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py
new file mode 100644
index 0000000000..1bbe8e106b
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class Food101Classification(AbsTaskImageClassification):
    """Food-101: classifying food photographs into 101 dish categories.

    NOTE(review): the task name carries the "Classification" suffix while
    sibling tasks (e.g. "CIFAR100", "EuroSAT") do not — confirm the naming
    convention before release. Evaluation runs on the validation split.
    """

    metadata = TaskMetadata(
        name="Food101Classification",
        description="Classifying food.",
        reference="https://huggingface.co/datasets/ethz/food101",
        dataset={
            "path": "ethz/food101",
            "revision": "e06acf2a88084f04bce4d4a525165d68e0a36c38",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["validation"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2013-01-01",
            "2014-01-01",
        ),  # Estimated collection period (published at ECCV 2014)
        domains=["Web"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{bossard14,
title = {Food-101 -- Mining Discriminative Components with Random Forests},
author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},
booktitle = {European Conference on Computer Vision},
year = {2014}
}
""",
        descriptive_stats={
            "n_samples": {"validation": 25300},
            "avg_character_length": {"validation": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py b/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py
new file mode 100644
index 0000000000..3244b47dc8
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class GTSRBClassification(AbsTaskImageClassification):
    """GTSRB: multi-class classification of German traffic-sign images (43 classes)."""

    metadata = TaskMetadata(
        name="GTSRB",
        description="""The German Traffic Sign Recognition Benchmark (GTSRB) is a multi-class classification dataset for traffic signs. It consists of more than 50,000 traffic sign images. The dataset comprises 43 classes with unbalanced class frequencies.""",
        reference="https://benchmark.ini.rub.de/",
        dataset={
            "path": "clip-benchmark/wds_gtsrb",
            "revision": "1c13eff0803d2b02c9dc8dfe85e67770b3f0f3c5",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2011-01-01",
            "2011-12-01",
        ),  # Estimated collection period (benchmark competition held at IJCNN 2011)
        # Fixed from "Activity recognition" (copy-paste error): classifying
        # traffic-sign images is object recognition, consistent with siblings.
        task_subtypes=["Object recognition"],
        domains=["Scene"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@INPROCEEDINGS{6033395,
author={Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian},
booktitle={The 2011 International Joint Conference on Neural Networks},
title={The German Traffic Sign Recognition Benchmark: A multi-class classification competition},
year={2011},
volume={},
number={},
pages={1453-1460},
keywords={Humans;Training;Image color analysis;Benchmark testing;Lead;Histograms;Image resolution},
doi={10.1109/IJCNN.2011.6033395}}
""",
        descriptive_stats={
            "n_samples": {"test": 12630},
            "avg_character_length": {"test": 0},
        },
    )
    # WebDataset export: images are stored under "webp", labels under "cls".
    image_column_name = "webp"
    label_column_name = "cls"
diff --git a/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py
new file mode 100644
index 0000000000..bed879d282
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class Imagenet1kClassification(AbsTaskImageClassification):
    """ImageNet-1k: classifying images into the 1000 ILSVRC categories."""

    metadata = TaskMetadata(
        name="Imagenet1k",
        description="ImageNet, a large-scale ontology of images built upon the backbone of the WordNet structure.",
        reference="https://ieeexplore.ieee.org/document/5206848",
        dataset={
            "path": "clip-benchmark/wds_imagenet1k",
            "revision": "b24c7a5a3ef12df09089055d1795e2ce7c7e7397",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2010-01-01",
            "2012-01-01",
        ),  # Estimated collection period (ILSVRC era)
        domains=["Scene"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@article{deng2009imagenet,
title={ImageNet: A large-scale hierarchical image database},
author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
journal={2009 IEEE Conference on Computer Vision and Pattern Recognition},
pages={248--255},
year={2009},
organization={Ieee}
}""",
        descriptive_stats={
            "n_samples": {"test": 37200},
            "avg_character_length": {"test": 0},
        },
    )
    # WebDataset export: images are stored under "jpg", labels under "cls".
    image_column_name: str = "jpg"
    label_column_name: str = "cls"
diff --git a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py
new file mode 100644
index 0000000000..8230938a14
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class MNISTClassification(AbsTaskImageClassification):
    """MNIST: classifying 28x28 grayscale images of handwritten digits (0-9)."""

    metadata = TaskMetadata(
        name="MNIST",
        description="Classifying handwritten digits.",
        reference="https://en.wikipedia.org/wiki/MNIST_database",
        dataset={
            "path": "ylecun/mnist",
            "revision": "77f3279092a1c1579b2250db8eafed0ad422088c",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2010-01-01",
            "2010-04-01",
        ),  # NOTE(review): MNIST dates to 1998; this range matches the 2010
        # online-database citation below — confirm which is intended.
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@article{lecun2010mnist,
title={MNIST handwritten digit database},
author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
volume={2},
year={2010}
}
""",
        descriptive_stats={
            "n_samples": {"test": 10000},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py
new file mode 100644
index 0000000000..7f607d6aac
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class OxfordFlowersClassification(AbsTaskImageClassification):
    """Oxford Flowers-102: classifying flower photographs into 102 categories."""

    metadata = TaskMetadata(
        name="OxfordFlowersClassification",
        description="Classifying flowers",
        reference="https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train",
        dataset={
            "path": "nelorth/oxford-flowers",
            "revision": "a37b1891609c0376fa81eced756e7863e1bd873b",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2008-01-01",
            "2008-12-31",
        ),  # Dataset released with Nilsback & Zisserman (ICVGIP 2008)
        # Fixed from ["Reviews"] / ["Sentiment/Hate speech"], which were
        # copy-pasted from a text-review task and do not describe this data.
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="found",
        # Fixed from the placeholder string "d".
        bibtex_citation="""@InProceedings{Nilsback08,
author = {Nilsback, M-E. and Zisserman, A.},
title = {Automated Flower Classification over a Large Number of Classes},
booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
year = {2008},
}""",
        descriptive_stats={
            # TODO(review): 400000 looks wrong — the full Oxford Flowers-102
            # dataset contains 8,189 images; confirm against the HF test split.
            "n_samples": {"test": 400000},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py
new file mode 100644
index 0000000000..28a2357d5c
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class OxfordPetsClassification(AbsTaskImageClassification):
    """Oxford-IIIT Pet: classifying images of 37 cat and dog breeds."""

    metadata = TaskMetadata(
        name="OxfordPets",
        description="Classifying animal images.",
        # Fixed from https://arxiv.org/abs/1306.5151 (the FGVC-Aircraft
        # paper, copy-pasted by mistake) to the Oxford-IIIT Pet publication.
        reference="https://ieeexplore.ieee.org/document/6248092",
        dataset={
            "path": "isaacchung/OxfordPets",
            "revision": "557b480fae8d69247be74d9503b378a09425096f",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2011-01-01",
            "2012-06-01",
        ),  # Estimated collection period; dataset published at CVPR 2012
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        # Fixed from the FGVC-Aircraft citation, copy-pasted by mistake.
        bibtex_citation="""@INPROCEEDINGS{6248092,
author={Parkhi, Omkar M and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, C. V.},
booktitle={2012 IEEE Conference on Computer Vision and Pattern Recognition},
title={Cats and dogs},
year={2012},
pages={3498-3505},
doi={10.1109/CVPR.2012.6248092}}
""",
        descriptive_stats={
            "n_samples": {"test": 3669},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py b/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py
new file mode 100644
index 0000000000..27508c8c17
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class PatchCamelyonClassification(AbsTaskImageClassification):
    """PatchCamelyon: binary classification of histopathology patches
    (presence/absence of metastatic tumor tissue)."""

    metadata = TaskMetadata(
        name="PatchCamelyon",
        description="""Histopathology diagnosis classification dataset.""",
        reference="https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24",
        dataset={
            "path": "clip-benchmark/wds_vtab-pcam",
            "revision": "502695fe1a141108650e3c5b91c8b5e0ff84ed49",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2018-01-01",
            "2018-12-01",
        ),  # Estimated collection period (published at MICCAI 2018)
        domains=["Medical"],
        task_subtypes=["Tumor detection"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{10.1007/978-3-030-00934-2_24,
author="Veeling, Bastiaan S.
and Linmans, Jasper
and Winkens, Jim
and Cohen, Taco
and Welling, Max",
editor="Frangi, Alejandro F.
and Schnabel, Julia A.
and Davatzikos, Christos
and Alberola-L{\'o}pez, Carlos
and Fichtinger, Gabor",
title="Rotation Equivariant CNNs for Digital Pathology",
booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018",
year="2018",
publisher="Springer International Publishing",
address="Cham",
pages="210--218",
abstract="We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.",
isbn="978-3-030-00934-2"
}
""",
        descriptive_stats={
            "n_samples": {"test": 32768},
            "avg_character_length": {"test": 0},
        },
    )
    # WebDataset export: images are stored under "webp", labels under "cls".
    image_column_name = "webp"
    label_column_name = "cls"
diff --git a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
new file mode 100644
index 0000000000..7fa7cd5d3d
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class RESISC45Classification(AbsTaskImageClassification):
    """RESISC45: remote-sensing image scene classification (45 scene classes)."""

    metadata = TaskMetadata(
        name="RESISC45",
        description="Remote Sensing Image Scene Classification by Northwestern Polytechnical University (NWPU).",
        reference="https://ieeexplore.ieee.org/abstract/document/7891544",
        dataset={
            "path": "timm/resisc45",
            "revision": "fe12fc5f1b7606543b0355eda392f1ddc54625c6",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2017-03-01",
        ),  # Estimated collection period (paper published 2017)
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@ARTICLE{7891544,
author={Cheng, Gong and Han, Junwei and Lu, Xiaoqiang},
journal={Proceedings of the IEEE},
title={Remote Sensing Image Scene Classification: Benchmark and State of the Art},
year={2017},
volume={105},
number={10},
pages={1865-1883},
keywords={Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning},
doi={10.1109/JPROC.2017.2675998}}
""",
        descriptive_stats={
            "n_samples": {"test": 6300},
            "avg_character_length": {"test": 256},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
new file mode 100644
index 0000000000..11ea833477
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class STL10Classification(AbsTaskImageClassification):
    """STL-10: classifying 96x96 colour images into 10 object classes."""

    metadata = TaskMetadata(
        name="STL10",
        description="Classifying 96x96 images from 10 classes.",
        reference="https://cs.stanford.edu/~acoates/stl10/",
        dataset={
            "path": "tanganke/stl10",
            "revision": "49ae7f94508f7feae62baf836db284306eab0b0f",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2011-01-01",
            "2011-04-01",
        ),  # Estimated collection period (published at AISTATS 2011)
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@InProceedings{pmlr-v15-coates11a,
title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning},
author = {Coates, Adam and Ng, Andrew and Lee, Honglak},
booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
pages = {215--223},
year = {2011},
editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav},
volume = {15},
series = {Proceedings of Machine Learning Research},
address = {Fort Lauderdale, FL, USA},
month = {11--13 Apr},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf},
url = {https://proceedings.mlr.press/v15/coates11a.html},
}
""",
        descriptive_stats={
            "n_samples": {"test": 8000},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py
new file mode 100644
index 0000000000..b4b5a8b931
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class SUN397Classification(AbsTaskImageClassification):
    """SUN397: large-scale scene recognition across 397 scene categories."""

    metadata = TaskMetadata(
        name="SUN397",
        description="Large scale scene recognition in 397 categories.",
        reference="https://ieeexplore.ieee.org/abstract/document/5539970",
        dataset={
            "path": "dpdl-benchmark/sun397",
            "revision": "7e6af6a2499ad708618be868e1471eac0aca1168",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2017-03-01",
        ),  # NOTE(review): the SUN paper is from CVPR 2010; this 2017 range
        # looks copy-pasted from another task — confirm.
        domains=["Encyclopaedic"],
        task_subtypes=["Scene recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@INPROCEEDINGS{5539970,
author={Xiao, Jianxiong and Hays, James and Ehinger, Krista A. and Oliva, Aude and Torralba, Antonio},
booktitle={2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
title={SUN database: Large-scale scene recognition from abbey to zoo},
year={2010},
volume={},
number={},
pages={3485-3492},
doi={10.1109/CVPR.2010.5539970}}
""",
        descriptive_stats={
            "n_samples": {"test": 21750},
            "avg_character_length": {"test": 256},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py
new file mode 100644
index 0000000000..74fa5e92b8
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
class StanfordCarsClassification(AbsTaskImageClassification):
    """Stanford Cars: fine-grained classification of car images (196 classes)."""

    metadata = TaskMetadata(
        name="StanfordCars",
        description="Classifying car images from 196 makes.",
        reference="https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content",
        dataset={
            "path": "isaacchung/StanfordCars",
            "revision": "09ffe9bc7864d3f1e851529e5c4b7e05601a04fb",
        },
        type="ImageClassification",
        category="i2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2013-01-01",
            "2013-04-01",
        ),  # Estimated collection period (dataset released 2013)
        domains=["Encyclopaedic"],
        task_subtypes=["Object recognition"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        modalities=["image"],
        sample_creation="created",
        bibtex_citation="""@inproceedings{Krause2013CollectingAL,
title={Collecting a Large-scale Dataset of Fine-grained Cars},
author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei},
year={2013},
url={https://api.semanticscholar.org/CorpusID:16632981}
}
""",
        descriptive_stats={
            "n_samples": {"test": 8041},
            "avg_character_length": {"test": 431.4},
        },
    )
diff --git a/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py b/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py
new file mode 100644
index 0000000000..dc4021b490
--- /dev/null
+++ b/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class UCF101Classification(AbsTaskImageClassification):
+ metadata = TaskMetadata(
+ name="UCF101",
+ description="""UCF101 is an action recognition data set of realistic
+action videos collected from YouTube, having 101 action categories. This
+version of the dataset does not contain videos but images saved frame by
+frame. Train and test splits are generated based on the authors' first
+version train/test list.""",
+ reference="https://huggingface.co/datasets/flwrlabs/ucf101",
+ dataset={
+ "path": "flwrlabs/ucf101",
+ "revision": "1098eed48f2929443f47c39f3b5c814e16369c11",
+ },
+ type="ImageClassification",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2012-01-01",
+ "2012-12-01",
+        ),  # Estimated range for the collection of the images
+ domains=["Scene"],
+ task_subtypes=["Activity recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@misc{soomro2012ucf101dataset101human,
+ title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild},
+ author={Khurram Soomro and Amir Roshan Zamir and Mubarak Shah},
+ year={2012},
+ eprint={1212.0402},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/1212.0402},
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 697222},
+ "avg_character_length": {"test": 0},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageClassification/eng/__init__.py b/mteb/tasks/Image/ImageClassification/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/ImageMultilabelClassification/__init__.py b/mteb/tasks/Image/ImageMultilabelClassification/__init__.py
new file mode 100644
index 0000000000..844f19a14c
--- /dev/null
+++ b/mteb/tasks/Image/ImageMultilabelClassification/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from .eng.PascalVOC2007 import *
diff --git a/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py b/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py
new file mode 100644
index 0000000000..ce32f85f93
--- /dev/null
+++ b/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageMultilabelClassification import (
+ AbsTaskImageMultilabelClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VOC2007Classification(AbsTaskImageMultilabelClassification):
+ metadata = TaskMetadata(
+ name="VOC2007",
+        description="Multi-label classification of images into the 20 Pascal VOC 2007 object categories.",
+ reference="http://host.robots.ox.ac.uk/pascal/VOC/",
+ dataset={
+ "path": "HuggingFaceM4/pascal_voc",
+ "name": "voc2007_main",
+ "revision": "dbafdb9e1506c9c419c5c4672e409463cd21ba50",
+ "trust_remote_code": True,
+ },
+ type="ImageMultilabelClassification",
+ category="i2i",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="lrap",
+ date=(
+ "2005-01-01",
+ "2014-01-01",
+        ),  # Estimated range for the collection of the images
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@Article{Everingham10,
+ author = "Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.",
+ title = "The Pascal Visual Object Classes (VOC) Challenge",
+ journal = "International Journal of Computer Vision",
+ volume = "88",
+ year = "2010",
+ number = "2",
+ month = jun,
+ pages = "303--338",
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 4952},
+ "avg_character_length": {"test": 431.4},
+ },
+ )
+
+ # Override default column name in the subclass
+ label_column_name: str = "classes"
+
+ # To be removed when we want full results
+ n_experiments: int = 5
diff --git a/mteb/tasks/Image/ImageMultilabelClassification/eng/__init__.py b/mteb/tasks/Image/ImageMultilabelClassification/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py b/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py
new file mode 100644
index 0000000000..ac5f03127e
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AROCocoOrder(AbsTaskImageTextPairClassification):
+ images_column_names = ["images"]
+ texts_column_names = [
+ "correct_caption",
+ "hard_text_1",
+ "hard_text_2",
+ "hard_text_3",
+ "hard_text_4",
+ ]
+
+ metadata = TaskMetadata(
+ name="AROCocoOrder",
+ description="Compositionality Evaluation of images to their captions."
+        + " Each caption has four hard negatives created by order permutations.",
+ reference="https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html",
+ dataset={
+ "path": "gowitheflow/ARO-COCO-order",
+ "revision": "853ec8757226585a38a80886c51fe0f3f268787c",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="text_acc",
+ date=(
+ "2022-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of data
+ domains=["Encyclopaedic"],
+ task_subtypes=["Caption Pairing"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@article{hsieh2024sugarcrepe,
+ title={Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality},
+ author={Hsieh, Cheng-Yu and Zhang, Jieyu and Ma, Zixian and Kembhavi, Aniruddha and Krishna, Ranjay},
+ journal={Advances in neural information processing systems},
+ volume={36},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 25010},
+ "avg_character_length": {"test": 1},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py b/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py
new file mode 100644
index 0000000000..18faadaf23
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AROFlickrOrder(AbsTaskImageTextPairClassification):
+ images_column_names = ["images"]
+ texts_column_names = [
+ "correct_caption",
+ "hard_text_1",
+ "hard_text_2",
+ "hard_text_3",
+ "hard_text_4",
+ ]
+
+ metadata = TaskMetadata(
+ name="AROFlickrOrder",
+ description="Compositionality Evaluation of images to their captions."
+        + " Each caption has four hard negatives created by order permutations.",
+ reference="https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html",
+ dataset={
+ "path": "gowitheflow/ARO-Flickr-Order",
+ "revision": "1f9485f69c87947812378a1aedf86410c86a0aa8",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="text_acc",
+ date=(
+ "2022-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of data
+ domains=["Encyclopaedic"],
+ task_subtypes=["Caption Pairing"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@article{hsieh2024sugarcrepe,
+ title={Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality},
+ author={Hsieh, Cheng-Yu and Zhang, Jieyu and Ma, Zixian and Kembhavi, Aniruddha and Krishna, Ranjay},
+ journal={Advances in neural information processing systems},
+ volume={36},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 5000},
+ "avg_character_length": {"test": 1},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py b/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py
new file mode 100644
index 0000000000..4f75db410b
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AROVisualAttribution(AbsTaskImageTextPairClassification):
+ images_column_names = ["image"]
+ texts_column_names = ["true_caption", "false_caption"]
+
+ metadata = TaskMetadata(
+ name="AROVisualAttribution",
+ description="Compositionality Evaluation of images to their captions.",
+ reference="https://openreview.net/forum?id=KRLUvxh8uaX",
+ dataset={
+ "path": "gowitheflow/ARO-Visual-Attribution",
+ "revision": "18f7e01358d91df599d723f00e16a18640e19398",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="text_acc",
+ date=(
+ "2022-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of data
+ domains=["Encyclopaedic"],
+ task_subtypes=["Caption Pairing"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{yuksekgonul2023and,
+ title={When and why vision-language models behave like bags-of-words, and what to do about it?},
+ author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James},
+ booktitle={The Eleventh International Conference on Learning Representations},
+ year={2023}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 28748},
+ "avg_character_length": {"test": 1},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py b/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py
new file mode 100644
index 0000000000..fef938271e
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AROVisualRelation(AbsTaskImageTextPairClassification):
+ images_column_names = ["image"]
+ texts_column_names = ["true_caption", "false_caption"]
+
+ metadata = TaskMetadata(
+ name="AROVisualRelation",
+ description="Compositionality Evaluation of images to their captions.",
+ reference="https://openreview.net/forum?id=KRLUvxh8uaX",
+ dataset={
+ "path": "gowitheflow/ARO-Visual-Relation",
+ "revision": "3867ad4f46a1ac2e63be034d1fc77dd8c2ef7209",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="text_acc",
+ date=(
+ "2022-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of data
+ domains=["Encyclopaedic"],
+ task_subtypes=["Caption Pairing"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{yuksekgonul2023and,
+ title={When and why vision-language models behave like bags-of-words, and what to do about it?},
+ author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James},
+ booktitle={The Eleventh International Conference on Learning Representations},
+ year={2023}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 23937},
+ "avg_character_length": {"test": 1},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py b/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py
new file mode 100644
index 0000000000..b410cbacd5
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+import datasets
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SugarCrepe(AbsTaskImageTextPairClassification):
+ images_column_names = ["images"]
+ texts_column_names = ["caption", "negative_caption"]
+
+ metadata = TaskMetadata(
+ name="SugarCrepe",
+ description="Compositionality Evaluation of images to their captions.",
+ reference="https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html",
+ dataset={
+ "path": "yjkimstats/SUGARCREPE_fmt",
+ "revision": "134abf9ade6a32f9fdae0e89022ff227a70b87e5",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="text_acc",
+ date=(
+ "2022-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of data
+ domains=["Encyclopaedic"],
+ task_subtypes=["Caption Pairing"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@article{hsieh2024sugarcrepe,
+ title={Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality},
+ author={Hsieh, Cheng-Yu and Zhang, Jieyu and Ma, Zixian and Kembhavi, Aniruddha and Krishna, Ranjay},
+ journal={Advances in neural information processing systems},
+ volume={36},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 7511},
+ "avg_character_length": {"test": 1},
+ },
+ )
+
+ def load_data(self, **kwargs):
+ """Load dataset from HuggingFace hub"""
+ if self.data_loaded:
+ return
+ self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"]) # type: ignore
+ self.dataset = datasets.DatasetDict({"test": self.dataset["train"]})
+ self.dataset_transform()
+ self.data_loaded = True
diff --git a/mteb/tasks/Image/ImageTextPairClassification/Winoground.py b/mteb/tasks/Image/ImageTextPairClassification/Winoground.py
new file mode 100644
index 0000000000..6169182286
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/Winoground.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Winoground(AbsTaskImageTextPairClassification):
+ images_column_names = ["image_0", "image_1"]
+ texts_column_names = ["caption_0", "caption_1"]
+
+ metadata = TaskMetadata(
+ name="Winoground",
+ description="Compositionality Evaluation of images to their captions.",
+ reference="https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper",
+ dataset={
+ "path": "facebook/winoground",
+ "revision": "b400e173549071916ad1b3d449293bc8d8b4b763",
+ },
+ type="ImageTextPairClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2022-01-01",
+ "2022-04-07",
+ ), # Estimated range for the collection of data
+ domains=["Social"], # Getty Images. Could be constructed?
+ task_subtypes=["Caption Pairing"],
+ license="https://huggingface.co/datasets/facebook/winoground/blob/main/license_agreement.txt",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@misc{thrush2022winogroundprobingvisionlanguage,
+ title={Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality},
+ author={Tristan Thrush and Ryan Jiang and Max Bartolo and Amanpreet Singh and Adina Williams and Douwe Kiela and Candace Ross},
+ year={2022},
+ eprint={2204.03162},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/2204.03162},
+ }""",
+ descriptive_stats={
+ "n_samples": {"test": 400},
+ "avg_character_length": {"test": 431.4},
+ },
+ )
diff --git a/mteb/tasks/Image/ImageTextPairClassification/__init__.py b/mteb/tasks/Image/ImageTextPairClassification/__init__.py
new file mode 100644
index 0000000000..69f0a9fbc1
--- /dev/null
+++ b/mteb/tasks/Image/ImageTextPairClassification/__init__.py
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from .AROCocoOrder import *
+from .AROFlickrOrder import *
+from .AROVisualAttribution import *
+from .AROVisualRelation import *
+from .SugarCrepe import *
+from .Winoground import *
diff --git a/mteb/tasks/Image/VisualSTS/__init__.py b/mteb/tasks/Image/VisualSTS/__init__.py
new file mode 100644
index 0000000000..eb785d5d85
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/__init__.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+from .en.STS12VisualSTS import *
+from .en.STS13VisualSTS import *
+from .en.STS14VisualSTS import *
+from .en.STS15VisualSTS import *
+from .en.STS16VisualSTS import *
+from .multilingual.STS17MultilingualVisualSTS import *
+from .multilingual.STSBenchmarkMultilingualVisualSTS import *
diff --git a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py
new file mode 100644
index 0000000000..09d550547f
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STS12VisualSTS(AbsTaskVisualSTS):
+ metadata = TaskMetadata(
+ name="STS12VisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts12",
+ "revision": "820c25edfba736f3789201b2476208cc62c2ccb9",
+ },
+        description="SemEval-2012 Task 6, " + "then rendered into images.",
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cosine_spearman",
+ date=("2005-01-01", "2012-12-31"),
+ domains=["Encyclopaedic", "News", "Written"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 5342},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py
new file mode 100644
index 0000000000..771e9e0ce8
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STS13VisualSTS(AbsTaskVisualSTS):
+ metadata = TaskMetadata(
+ name="STS13VisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts13",
+ "revision": "561ee9ca47ff3e4a657283c59416deca8dc169f2",
+ },
+        description="SemEval STS 2013 dataset, " + "then rendered into images.",
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cosine_spearman",
+ date=("2012-01-01", "2012-12-31"),
+ domains=["Web", "News", "Non-fiction", "Written"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 1500},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py
new file mode 100644
index 0000000000..299e54dca9
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STS14VisualSTS(AbsTaskVisualSTS):
+ metadata = TaskMetadata(
+ name="STS14VisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts14",
+ "revision": "824e95e45471024a684b901e0645579ffd9ca288",
+ },
+        description="SemEval STS 2014 dataset. Currently only the English dataset, "
+        + "rendered into images.",
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cosine_spearman",
+ date=("2012-01-01", "2012-08-31"),
+ domains=["Blog", "Web", "Spoken"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 3750},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py
new file mode 100644
index 0000000000..1756cdc55c
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STS15VisualSTS(AbsTaskVisualSTS):
+ metadata = TaskMetadata(
+ name="STS15VisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts15",
+ "revision": "1f8d08d9b9daac7118dfdefeb94b0aac4baf2e5f",
+ },
+        description="SemEval STS 2015 dataset, " + "rendered into images.",
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cosine_spearman",
+ date=("2008-01-01", "2014-07-28"),
+ domains=["Blog", "News", "Web", "Written", "Spoken"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 3000},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py
new file mode 100644
index 0000000000..dba6e4af63
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STS16VisualSTS(AbsTaskVisualSTS):
+ metadata = TaskMetadata(
+ name="STS16VisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts16",
+ "revision": "fc354f19598af93f32c0af1b94046ffdeaacde15",
+ },
+        description="SemEval STS 2016 dataset, " + "rendered into images.",
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="cosine_spearman",
+ date=("2015-10-01", "2015-12-31"),
+ domains=["Blog", "Web", "Spoken"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 1186},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/en/__init__.py b/mteb/tasks/Image/VisualSTS/en/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py
new file mode 100644
index 0000000000..068fd33b9c
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
+ "ko-ko": ["kor-Hang"],
+ "ar-ar": ["ara-Arab"],
+ "en-ar": ["eng-Latn", "ara-Arab"],
+ "en-de": ["eng-Latn", "deu-Latn"],
+ "en-en": ["eng-Latn"],
+ "en-tr": ["eng-Latn", "tur-Latn"],
+ "es-en": ["spa-Latn", "eng-Latn"],
+ "es-es": ["spa-Latn"],
+ "fr-en": ["fra-Latn", "eng-Latn"],
+ "it-en": ["ita-Latn", "eng-Latn"],
+ "nl-en": ["nld-Latn", "eng-Latn"],
+}
+
+_SPLITS = ["test"]
+
+
+class STS17MultilingualVisualSTS(AbsTaskVisualSTS, MultilingualTask):
+ metadata = TaskMetadata(
+ name="STS17MultilingualVisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-sts17",
+ "revision": "2e31b4b459551a51e1ab54fd7266b40f3fe510d4",
+ },
+ description=(
+ "Semantic Textual Similarity 17 (STS-17) dataset, "
+ + "rendered into images."
+ ),
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=_SPLITS,
+ eval_langs=_LANGUAGES,
+ main_score="cosine_spearman",
+ date=("2012-01-01", "2017-12-31"),
+ domains=["News", "Social", "Web", "Spoken", "Written"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 10692},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py
new file mode 100644
index 0000000000..ce8c047655
--- /dev/null
+++ b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
+ "en": ["eng-Latn"],
+ "de": ["deu-Latn"],
+ "es": ["spa-Latn"],
+ "fr": ["fra-Latn"],
+ "it": ["ita-Latn"],
+ "nl": ["nld-Latn"],
+ "pl": ["pol-Latn"],
+ "pt": ["por-Latn"],
+ "ru": ["rus-Cyrl"],
+ "zh": ["cmn-Hans"],
+}
+
+_SPLITS = ["dev", "test"]
+
+
+class STSBenchmarkMultilingualVisualSTS(AbsTaskVisualSTS, MultilingualTask):
+ metadata = TaskMetadata(
+ name="STSBenchmarkMultilingualVisualSTS",
+ dataset={
+ "path": "Pixel-Linguist/rendered-stsb",
+ "revision": "9f1ab21f17f497974996ab74b3ff911165a7dbf9",
+ },
+ description=(
+ "Semantic Textual Similarity Benchmark (STSbenchmark) dataset, "
+            "translated into target languages using DeepL API, "
+            + "then rendered into images. "
+            + "Built upon multi-sts created by Philip May."
+ ),
+ reference="https://arxiv.org/abs/2402.08183/",
+ type="VisualSTS",
+ category="i2i",
+ modalities=["image"],
+ eval_splits=_SPLITS,
+ eval_langs=_LANGUAGES,
+ main_score="cosine_spearman",
+ date=("2012-01-01", "2017-12-31"),
+ domains=["News", "Social", "Web", "Spoken", "Written"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="rendered",
+ bibtex_citation="""@article{xiao2024pixel,
+ title={Pixel Sentence Representation Learning},
+ author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al},
+ journal={arXiv preprint arXiv:2402.08183},
+ year={2024}
+}""",
+ descriptive_stats={
+ "n_samples": {"dev": 15000, "test": 13790},
+ "avg_character_length": {"dev": 1.0, "test": 1.0},
+ },
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/Image/VisualSTS/multilingual/__init__.py b/mteb/tasks/Image/VisualSTS/multilingual/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/ZeroshotClassification/__init__.py b/mteb/tasks/Image/ZeroshotClassification/__init__.py
new file mode 100644
index 0000000000..1baf75604b
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/__init__.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from .eng.Birdsnap import *
+from .eng.Caltech101 import *
+from .eng.CIFAR import *
+from .eng.CLEVR import *
+from .eng.Country211 import *
+from .eng.DTD import *
+from .eng.EuroSAT import *
+from .eng.FER2013 import *
+from .eng.FGVCAircraft import *
+from .eng.Food101 import *
+from .eng.GTSRB import *
+from .eng.Imagenet1k import *
+from .eng.MNIST import *
+from .eng.OxfordPets import *
+from .eng.PatchCamelyon import *
+from .eng.RenderedSST2 import *
+from .eng.RESISC45 import *
+from .eng.SciMMIR import *
+from .eng.StanfordCars import *
+from .eng.STL10 import *
+from .eng.SUN397 import *
+from .eng.UCF101 import *
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py
new file mode 100644
index 0000000000..14609d08a6
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BirdsnapClassification(AbsTaskZeroshotClassification):  # Zero-shot bird-species classification (500 species).
+ metadata = TaskMetadata(
+ name="BirdsnapZeroShot",
+ description="Classifying bird images from 500 species.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html",
+ dataset={
+ "path": "isaacchung/birdsnap",
+ "revision": "fd23015508be94f0b5b59d61630e4ea2536509e4",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2013-01-01",
+ "2014-01-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{Berg_2014_CVPR,
+ author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. and Belhumeur, Peter N.},
+ title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds},
+ booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2014}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 1851},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ # Labels live in the 'common' (common species name) column, overriding the base default.
+ label_column_name: str = "common"
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per class, in label-index order
+ return [
+ f"a photo of a {name}, a type of bird."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py
new file mode 100644
index 0000000000..91b0b159ef
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification):  # Zero-shot CIFAR-10 object classification via text prompts.
+ metadata = TaskMetadata(
+ name="CIFAR10ZeroShot",
+ description="Classifying images from 10 classes.",
+ reference="https://huggingface.co/datasets/uoft-cs/cifar10",
+ dataset={
+ "path": "uoft-cs/cifar10",
+ "revision": "0b2714987fa478483af9968de7c934580d0bb9a2",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2008-01-01",
+ "2009-01-01",
+ ), # Estimated dataset construction period
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
+ author = {Alex Krizhevsky},
+ title = {Learning multiple layers of features from tiny images},
+ institution = {},
+ year = {2009}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 10000},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+ image_column_name: str = "img"  # the HF cifar10 dataset stores images under "img"; label column keeps base default
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per class, in label-index order
+ return [
+ f"a photo of a {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
+
+
+class CIFAR100ZeroShotClassification(AbsTaskZeroshotClassification):  # Zero-shot CIFAR-100 classification over the 100 fine-grained labels.
+ metadata = TaskMetadata(
+ name="CIFAR100ZeroShot",
+ description="Classifying images from 100 classes.",
+ reference="https://huggingface.co/datasets/uoft-cs/cifar100",
+ dataset={
+ "path": "uoft-cs/cifar100",
+ "revision": "aadb3af77e9048adbea6b47c21a81e47dd092ae5",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2008-01-01",
+ "2009-01-01",
+ ), # Estimated dataset construction period
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple,
+ author = {Alex Krizhevsky},
+ title = {Learning multiple layers of features from tiny images},
+ institution = {},
+ year = {2009}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 10000},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+ image_column_name: str = "img"  # cifar100 also stores images under "img"
+ label_column_name: str = "fine_label"  # use the 100 fine labels, not the 20 coarse ones
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per fine label, in label-index order
+ return [
+ f"a photo of a {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py b/mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py
new file mode 100644
index 0000000000..9b7397e24b
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CLEVR(AbsTaskZeroshotClassification):  # Zero-shot CLEVR closest-object-distance classification.
+ metadata = TaskMetadata(
+ name="CLEVRZeroShot",
+ description="CLEVR closest object distance identification task.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html",
+ dataset={
+ "path": "clip-benchmark/wds_vtab-clevr_closest_object_distance",
+ "revision": "ec9c04224a95836ca0344a6000ec8d8bc8a6d4f2",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2016-01-01", "2016-12-20"),
+ domains=["Constructed"],
+ task_subtypes=["Object recognition"],
+ license="cc-by-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""\
+@InProceedings{Johnson_2017_CVPR,
+author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. and Girshick, Ross},
+title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning},
+booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {July},
+year = {2017}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 15000},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ image_column_name: str = "webp"  # webdataset export stores the image under the "webp" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # prompts ordered to match class indices 0..5
+ labels = [
+ "very nearby",
+ "nearby",
+ "near",
+ "", # missing this class name in the original dataset: https://huggingface.co/datasets/clip-benchmark/wds_vtab-clevr_closest_object_distance/blob/main/classnames.txt
+ "distant",
+ "very distant",
+ ]
+
+ return [f"{c} shapes." for c in labels]  # the empty entry yields " shapes.", mirroring upstream classnames.txt
+
+
+class CLEVRCount(AbsTaskZeroshotClassification):  # Zero-shot CLEVR object-counting classification.
+ metadata = TaskMetadata(
+ name="CLEVRCountZeroShot",
+ description="CLEVR count objects task.",
+ reference="https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html",
+ dataset={
+ "path": "clip-benchmark/wds_vtab-clevr_count_all",
+ "revision": "8b5dce4d5393a04fb58b9261b22a881b02e379b1",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2016-01-01", "2016-12-20"),
+ domains=["Constructed"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""\
+@InProceedings{Johnson_2017_CVPR,
+author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. and Girshick, Ross},
+title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning},
+booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {July},
+year = {2017}
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 15000},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ image_column_name: str = "webp"  # webdataset export stores the image under the "webp" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # counts 3..10, ordered to match class indices — presumably mirrors upstream classnames.txt; verify
+ labels = [
+ "three",
+ "four",
+ "five",
+ "six",
+ "seven",
+ "eight",
+ "nine",
+ "ten",
+ ]
+ return [f"a picture of {c} objects" for c in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py
new file mode 100644
index 0000000000..749ac71273
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Caltech101Classification(AbsTaskZeroshotClassification):  # Zero-shot Caltech-101 object classification.
+ metadata = TaskMetadata(
+ name="Caltech101ZeroShot",
+ description="Classifying images of 101 widely varied objects.",
+ reference="https://ieeexplore.ieee.org/document/1384978",
+ dataset={
+ "path": "HuggingFaceM4/Caltech-101",
+ "name": "with_background_category",
+ "revision": "851374102055782c84f89b1b4e9d128a6568847b",
+ "trust_remote_code": True,
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2003-01-01",
+ "2004-01-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@INPROCEEDINGS{1384978,
+ author={Li Fei-Fei and Fergus, R. and Perona, P.},
+ booktitle={2004 Conference on Computer Vision and Pattern Recognition Workshop},
+ title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories},
+ year={2004},
+ volume={},
+ number={},
+ pages={178-178},
+ keywords={Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases},
+ doi={10.1109/CVPR.2004.383}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 6084},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per class, in label-index order
+ return [
+ f"a photo of a {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py b/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py
new file mode 100644
index 0000000000..eb0dd5158b
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import os
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Country211Classification(AbsTaskZeroshotClassification):  # Zero-shot geolocation: classify an image into one of 211 countries.
+ metadata = TaskMetadata(
+ name="Country211ZeroShot",
+ description="Classifying images of 211 countries.",
+ reference="https://huggingface.co/datasets/clip-benchmark/wds_country211",
+ dataset={
+ "path": "clip-benchmark/wds_country211",
+ "revision": "1699f138f0558342a1cbf99f7cf36b4361bb5ebc",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2020-01-01",
+ "2021-02-26",
+ ), # Estimated dataset construction period
+ domains=["Scene"],
+ task_subtypes=["Scene recognition"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@article{radford2021learning,
+ title={Learning Transferable Visual Models From Natural Language Supervision},
+ author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others},
+ journal={arXiv preprint arXiv:2103.00020},
+ year={2021}
+ }""",
+ descriptive_stats={
+ "n_samples": {"test": 21100},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ image_column_name: str = "jpg"  # webdataset export stores the image under the "jpg" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per country, in label-index order
+ path = os.path.dirname(__file__)
+ with open(os.path.join(path, "templates/Country211_labels.txt")) as f:
+ labels = [line.strip() for line in f]  # strip: readlines() would keep the trailing newline inside each prompt
+
+ return [f"a photo showing the country of {c}." for c in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py
new file mode 100644
index 0000000000..2d182e0854
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class DTDClassification(AbsTaskZeroshotClassification):  # Zero-shot texture classification on the Describable Textures Dataset.
+ metadata = TaskMetadata(
+ name="DTDZeroShot",
+ description="Describable Textures Dataset in 47 categories.",
+ reference="https://www.robots.ox.ac.uk/~vgg/data/dtd/",
+ dataset={
+ "path": "tanganke/dtd",
+ "revision": "d2afa97d9f335b1a6b3b09c637aef667f98f966e",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2014-01-01",
+ "2014-03-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Textures recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{cimpoi14describing,
+ Author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi},
+ Title = {Describing Textures in the Wild},
+ Booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})},
+ Year = {2014}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 1880},
+ "avg_character_length": {"test": 456},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per texture class, in label-index order
+ return [
+ f"a photo of {name} texture."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py
new file mode 100644
index 0000000000..85a1b13e5d
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class EuroSATClassification(AbsTaskZeroshotClassification):  # Zero-shot land-use classification on EuroSAT satellite imagery.
+ metadata = TaskMetadata(
+ name="EuroSATZeroShot",
+ description="Classifying satellite images.",
+ reference="https://ieeexplore.ieee.org/document/8736785",
+ dataset={
+ "path": "timm/eurosat-rgb",
+ "revision": "b4e28552cd5f3932b6abc37eb20d3e84901ad728",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2019-01-01",
+ "2019-03-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Scene recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@ARTICLE{8736785,
+ author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian},
+ journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing},
+ title={EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification},
+ year={2019},
+ volume={12},
+ number={7},
+ pages={2217-2226},
+ keywords={Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite images},
+ doi={10.1109/JSTARS.2019.2918242}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 5400},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # hand-written class phrases ordered to match class indices — presumably the CLIP prompt set; verify
+ labels = [
+ "annual crop land",
+ "forest land",
+ "brushland or shrubland",
+ "highway or road",
+ "industrial land",
+ "pasture land",
+ "permanent crop land",
+ "residential land",
+ "river",
+ "sea or lake",
+ ]
+ return [f"a centered satellite photo of {name}." for name in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py
new file mode 100644
index 0000000000..a0a391e235
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class FER2013Classification(AbsTaskZeroshotClassification):  # Zero-shot facial-emotion classification (7 emotions).
+ metadata = TaskMetadata(
+ name="FER2013ZeroShot",
+ description="Classifying facial emotions.",
+ reference="https://arxiv.org/abs/1412.6572",  # NOTE(review): cites the adversarial-examples paper, not the FER2013 challenge paper — confirm intended
+ dataset={
+ "path": "clip-benchmark/wds_fer2013",
+ "revision": "9399b94167523fe5c40b3a857e24ef931ee4395b",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2014-01-01",
+ "2014-12-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Emotion recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@misc{goodfellow2015explainingharnessingadversarialexamples,
+ title={Explaining and Harnessing Adversarial Examples},
+ author={Ian J. Goodfellow and Jonathon Shlens and Christian Szegedy},
+ year={2015},
+ eprint={1412.6572},
+ archivePrefix={arXiv},
+ primaryClass={stat.ML},
+ url={https://arxiv.org/abs/1412.6572},
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 7178},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+ image_column_name: str = "jpg"  # webdataset export stores the image under the "jpg" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # emotion prompts ordered to match class indices 0..6
+ labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
+ return [f"a photo of a {name} looking face." for name in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py
new file mode 100644
index 0000000000..65af473d3f
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class FGVCAircraftClassification(AbsTaskZeroshotClassification):  # Zero-shot fine-grained aircraft classification.
+ metadata = TaskMetadata(
+ name="FGVCAircraftZeroShot",
+ description="Classifying aircraft images from 41 manufacturers and 102 variants.",
+ reference="https://arxiv.org/abs/1306.5151",
+ dataset={
+ "path": "HuggingFaceM4/FGVC-Aircraft",
+ "revision": "91860adfc9a09aabca5cddb5247442109b38e213",
+ "trust_remote_code": True,
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2009-01-01",
+ "2010-01-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@misc{maji2013finegrainedvisualclassificationaircraft,
+ title={Fine-Grained Visual Classification of Aircraft},
+ author={Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi},
+ year={2013},
+ eprint={1306.5151},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/1306.5151},
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 3333},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+ label_column_name: str = "variant" ## could be family, manufacturer, or variant. Variant has the higher number of classes.
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per variant, in label-index order
+ return [
+ f"a photo of a {name}, a type of aircraft."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py
new file mode 100644
index 0000000000..cc64484e65
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Food101Classification(AbsTaskZeroshotClassification):  # Zero-shot Food-101 dish classification (uses the validation split).
+ metadata = TaskMetadata(
+ name="Food101ZeroShot",
+ description="Classifying food.",
+ reference="https://huggingface.co/datasets/ethz/food101",
+ dataset={
+ "path": "ethz/food101",
+ "revision": "e06acf2a88084f04bce4d4a525165d68e0a36c38",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["validation"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2013-01-01",
+ "2014-01-01",
+ ), # Estimated dataset construction period
+ domains=["Web"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation=""" @inproceedings{bossard14,
+ title = {Food-101 -- Mining Discriminative Components with Random Forests},
+ author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},
+ booktitle = {European Conference on Computer Vision},
+ year = {2014}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"validation": 25300},
+ "avg_character_length": {"validation": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per dish class, in label-index order
+ return [
+ f"a photo of {name}, a type of food."
+ for name in self.dataset["validation"]
+ .features[self.label_column_name]
+ .names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py b/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py
new file mode 100644
index 0000000000..e08866b6bd
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import os
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class GTSRBClassification(AbsTaskZeroshotClassification):  # Zero-shot German traffic-sign classification (43 classes).
+ metadata = TaskMetadata(
+ name="GTSRBZeroShot",
+ description="""The German Traffic Sign Recognition Benchmark (GTSRB) is a multi-class classification dataset for traffic signs. It consists of dataset of more than 50,000 traffic sign images. The dataset comprises 43 classes with unbalanced class frequencies.""",
+ reference="https://benchmark.ini.rub.de/",
+ dataset={
+ "path": "clip-benchmark/wds_gtsrb",
+ "revision": "1c13eff0803d2b02c9dc8dfe85e67770b3f0f3c5",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2011-01-01",
+ "2011-12-01",
+ ), # Estimated dataset construction period
+ task_subtypes=["Activity recognition"],
+ domains=["Scene"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@INPROCEEDINGS{6033395,
+ author={Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian},
+ booktitle={The 2011 International Joint Conference on Neural Networks},
+ title={The German Traffic Sign Recognition Benchmark: A multi-class classification competition},
+ year={2011},
+ volume={},
+ number={},
+ pages={1453-1460},
+ keywords={Humans;Training;Image color analysis;Benchmark testing;Lead;Histograms;Image resolution},
+ doi={10.1109/IJCNN.2011.6033395}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 12630},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ image_column_name: str = "webp"  # webdataset export stores the image under the "webp" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per sign class, in label-index order
+ path = os.path.dirname(__file__)
+ with open(os.path.join(path, "templates/GTSRB_labels.txt")) as f:
+ labels = [line.strip() for line in f]  # strip: readlines() would keep the trailing newline inside each prompt
+
+ return [f"a close up photo of a '{c}' traffic sign." for c in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py
new file mode 100644
index 0000000000..53dce7feb1
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import os
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class Imagenet1kClassification(AbsTaskZeroshotClassification):  # Zero-shot ImageNet-1k classification.
+ metadata = TaskMetadata(
+ name="Imagenet1kZeroShot",
+ description="ImageNet, a large-scale ontology of images built upon the backbone of the WordNet structure.",
+ reference="https://ieeexplore.ieee.org/document/5206848",
+ dataset={
+ "path": "clip-benchmark/wds_imagenet1k",
+ "revision": "b24c7a5a3ef12df09089055d1795e2ce7c7e7397",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2010-01-01",
+ "2012-01-01",
+ ), # Estimated dataset construction period
+ domains=["Scene"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@article{deng2009imagenet,
+ title={ImageNet: A large-scale hierarchical image database},
+ author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
+ journal={2009 IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={248--255},
+ year={2009},
+ organization={Ieee}
+ }""",
+ descriptive_stats={
+ "n_samples": {"test": 37200},
+ "avg_character_length": {"test": 0},
+ },
+ )
+ image_column_name: str = "jpg"  # webdataset export stores the image under the "jpg" key
+ label_column_name: str = "cls"  # and the integer class under "cls"
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per ImageNet class, in label-index order
+ path = os.path.dirname(__file__)
+ with open(os.path.join(path, "templates/Imagenet1k_labels.txt")) as f:
+ labels = [line.strip() for line in f]  # strip: readlines() would keep the trailing newline inside each prompt
+
+ return [f"a photo of {c}." for c in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py
new file mode 100644
index 0000000000..6433104c90
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MNISTClassification(AbsTaskZeroshotClassification):  # Zero-shot handwritten-digit classification (0-9).
+ metadata = TaskMetadata(
+ name="MNISTZeroShot",
+ description="Classifying handwritten digits.",
+ reference="https://en.wikipedia.org/wiki/MNIST_database",
+ dataset={
+ "path": "ylecun/mnist",
+ "revision": "77f3279092a1c1579b2250db8eafed0ad422088c",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2010-01-01",
+ "2010-04-01",
+ ), # Estimated dataset construction period
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@article{lecun2010mnist,
+ title={MNIST handwritten digit database},
+ author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+ journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+ volume={2},
+ year={2010}
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 10000},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per digit class, in label-index order
+ return [
+ f"a photo of the number: '{name}'."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py
new file mode 100644
index 0000000000..372d2fa7bf
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class OxfordPetsClassification(AbsTaskZeroshotClassification):  # Zero-shot Oxford-IIIT Pets breed classification.
+ metadata = TaskMetadata(
+ name="OxfordPetsZeroShot",
+ description="Classifying animal images.",
+ reference="https://www.robots.ox.ac.uk/~vgg/data/pets/",
+ dataset={
+ "path": "isaacchung/OxfordPets",
+ "revision": "557b480fae8d69247be74d9503b378a09425096f",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2011-01-01",
+ "2012-06-01",
+ ), # Estimated dataset construction period (dataset published at CVPR 2012)
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{parkhi12a,
+ author = {Parkhi, O. M. and Vedaldi, A. and Zisserman, A. and Jawahar, C. V.},
+ title = {Cats and Dogs},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
+ year = {2012},
+ pages = {3498--3505},
+ doi = {10.1109/CVPR.2012.6248092},
+ url = {https://www.robots.ox.ac.uk/~vgg/data/pets/},
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 3669},
+ "avg_character_length": {"test": 431.4},  # NOTE(review): looks copied from a text task — verify
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:  # one prompt per breed, in label-index order
+ return [
+ f"a photo of a {name}, a type of pet."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py b/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py
new file mode 100644
index 0000000000..24b3e7a4b1
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import os
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class PatchCamelyonClassification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="PatchCamelyonZeroShot",
+ description="""Histopathology diagnosis classification dataset.""",
+ reference="https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24",
+ dataset={
+ "path": "clip-benchmark/wds_vtab-pcam",
+ "revision": "502695fe1a141108650e3c5b91c8b5e0ff84ed49",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2018-01-01",
+ "2018-12-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Medical"],
+ task_subtypes=["Tumor detection"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{10.1007/978-3-030-00934-2_24,
+author="Veeling, Bastiaan S.
+and Linmans, Jasper
+and Winkens, Jim
+and Cohen, Taco
+and Welling, Max",
+editor="Frangi, Alejandro F.
+and Schnabel, Julia A.
+and Davatzikos, Christos
+and Alberola-L{\'o}pez, Carlos
+and Fichtinger, Gabor",
+title="Rotation Equivariant CNNs for Digital Pathology",
+booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018",
+year="2018",
+publisher="Springer International Publishing",
+address="Cham",
+pages="210--218",
+abstract="We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.",
+isbn="978-3-030-00934-2"
+}
+""",
+ descriptive_stats={
+ "n_samples": {"test": 32768},
+ "avg_character_length": {"test": 0},
+ },
+ )
+ image_column_name = "webp"
+ label_column_name = "cls"
+
+ def get_candidate_labels(self) -> list[str]:
+ path = os.path.dirname(__file__)
+ with open(os.path.join(path, "templates/PatchCamelyon_labels.txt")) as f:
+ labels = f.readlines()
+
+ return [f"histopathology image of {c}" for c in labels]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py
new file mode 100644
index 0000000000..e58da7863e
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class RESISC45Classification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="RESISC45ZeroShot",
+ description="Remote Sensing Image Scene Classification by Northwestern Polytechnical University (NWPU).",
+ reference="https://ieeexplore.ieee.org/abstract/document/7891544",
+ dataset={
+ "path": "timm/resisc45",
+ "revision": "fe12fc5f1b7606543b0355eda392f1ddc54625c6",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2017-01-01",
+ "2017-03-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@ARTICLE{7891544,
+ author={Cheng, Gong and Han, Junwei and Lu, Xiaoqiang},
+ journal={Proceedings of the IEEE},
+ title={Remote Sensing Image Scene Classification: Benchmark and State of the Art},
+ year={2017},
+ volume={105},
+ number={10},
+ pages={1865-1883},
+ keywords={Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning},
+ doi={10.1109/JPROC.2017.2675998}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 6300},
+ "avg_character_length": {"test": 256},
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ return [
+ f"satellite imagery of {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py b/mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py
new file mode 100644
index 0000000000..cad88534a2
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class RenderedSST2(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="RenderedSST2",
+ description="RenderedSST2.",
+ reference="https://huggingface.co/datasets/clip-benchmark/wds_renderedsst2",
+ dataset={
+ "path": "clip-benchmark/wds_renderedsst2",
+ "revision": "66b9a461eda025201dd147e5f390f5984c33643a",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2016-01-01", "2016-12-31"),
+ domains=["Reviews"],
+ task_subtypes=[],
+ license="mit",
+ annotations_creators="human-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""d""",
+ descriptive_stats={
+ "n_samples": {"test": 1820},
+ "avg_character_length": {"test": 10.0},
+ },
+ )
+
+ # Override default column names in the subclass
+ image_column_name: str = "png"
+ label_column_name: str = "cls"
+
+ def get_candidate_labels(self) -> list[str]:
+ return ["a negative review of a movie", "a positive review of a movie"]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py
new file mode 100644
index 0000000000..67357adc88
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class STL10Classification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="STL10ZeroShot",
+ description="Classifying 96x96 images from 10 classes.",
+ reference="https://cs.stanford.edu/~acoates/stl10/",
+ dataset={
+ "path": "tanganke/stl10",
+ "revision": "49ae7f94508f7feae62baf836db284306eab0b0f",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2011-01-01",
+ "2011-04-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Encyclopaedic"],
+ task_subtypes=["Object recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@InProceedings{pmlr-v15-coates11a,
+ title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning},
+ author = {Coates, Adam and Ng, Andrew and Lee, Honglak},
+ booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
+ pages = {215--223},
+ year = {2011},
+ editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav},
+ volume = {15},
+ series = {Proceedings of Machine Learning Research},
+ address = {Fort Lauderdale, FL, USA},
+ month = {11--13 Apr},
+ publisher = {PMLR},
+ pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf},
+ url = {https://proceedings.mlr.press/v15/coates11a.html},
+ }
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 8000},
+ "avg_character_length": {"test": 431.4},
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ return [
+ f"a photo of a {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py
new file mode 100644
index 0000000000..c28bf146f1
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SUN397Classification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="SUN397ZeroShot",
+ description="Large scale scene recognition in 397 categories.",
+ reference="https://ieeexplore.ieee.org/abstract/document/5539970",
+ dataset={
+ "path": "dpdl-benchmark/sun397",
+ "revision": "7e6af6a2499ad708618be868e1471eac0aca1168",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2017-01-01",
+ "2017-03-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Encyclopaedic"],
+ task_subtypes=["Scene recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@INPROCEEDINGS{5539970,
+ author={Xiao, Jianxiong and Hays, James and Ehinger, Krista A. and Oliva, Aude and Torralba, Antonio},
+ booktitle={2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
+ title={SUN database: Large-scale scene recognition from abbey to zoo},
+ year={2010},
+ volume={},
+ number={},
+ pages={3485-3492},
+ doi={10.1109/CVPR.2010.5539970}}
+ """,
+ descriptive_stats={
+ "n_samples": {"test": 21750},
+ "avg_character_length": {"test": 256},
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ """Convert labels as such:
+ - /b/boat_deck -> boat deck
+ - /c/church/outdoor -> church outdoor
+ """
+ labels = []
+ for name in self.dataset["test"].features[self.label_column_name].names:
+ name = " ".join(name.split("/")[2:]).replace("_", " ")
+ labels.append(f"a photo of a {name}.")
+ return labels
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py b/mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py
new file mode 100644
index 0000000000..abb651612f
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SciMMIR(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="SciMMIR",
+ description="SciMMIR.",
+ reference="https://huggingface.co/datasets/m-a-p/SciMMIR",
+ dataset={
+ "path": "m-a-p/SciMMIR",
+ "revision": "eea276dc58c52eab33e9476acb137ff5530b78e9",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2023-05-01", "2023-10-30"),
+ domains=["Academic"],
+ task_subtypes=["Caption Pairing", "Rendered Texts Understanding"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ modalities=["text", "image"],
+ sample_creation="created",
+ bibtex_citation="""\
+@misc{wu2024scimmirbenchmarkingscientificmultimodal,
+ title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval},
+ author={Siwei Wu and Yizhi Li and Kang Zhu and Ge Zhang and Yiming Liang and Kaijing Ma and Chenghao Xiao and Haoran Zhang and Bohao Yang and Wenhu Chen and Wenhao Huang and Noura Al Moubayed and Jie Fu and Chenghua Lin},
+ year={2024},
+ eprint={2401.13478},
+ archivePrefix={arXiv},
+ primaryClass={cs.IR},
+ url={https://arxiv.org/abs/2401.13478},
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 16263},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ label_column_name: str = "class"
+
+ def dataset_transform(self):
+ class_code = {
+ "fig_result": 0,
+ "fig_illustration": 1,
+ "fig_architecture": 2,
+ "table_parameter": 3,
+ "table_result": 4,
+ }
+ for split in self.metadata.eval_splits:
+ self.dataset[split] = self.dataset[split].map(
+ lambda example: {
+ "image": example["image"],
+ "class": class_code[example[self.label_column_name]],
+ }
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ return [
+ "a figure of results",
+ "a figure of an illustration",
+ "a figure of an architecture",
+ "a table of parameters",
+ "a table of results",
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py
new file mode 100644
index 0000000000..d3e01a34ca
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class StanfordCarsClassification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="StanfordCarsZeroShot",
+ description="Classifying car images from 96 makes.",
+ reference="https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content",
+ dataset={
+ "path": "isaacchung/StanfordCars",
+ "revision": "09ffe9bc7864d3f1e851529e5c4b7e05601a04fb",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2013-01-01",
+ "2013-04-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Scene"],
+ task_subtypes=["Scene recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image", "text"],
+ sample_creation="created",
+ bibtex_citation="""@inproceedings{Krause2013CollectingAL,
+ title={Collecting a Large-scale Dataset of Fine-grained Cars},
+ author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei},
+ year={2013},
+ url={https://api.semanticscholar.org/CorpusID:16632981}
+ }""",
+ descriptive_stats={
+ "n_samples": {"test": 8041},
+ "avg_character_length": {"test": 431.4},
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ return [
+ f"a photo of a {name}."
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py b/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py
new file mode 100644
index 0000000000..b0d5293632
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class UCF101Classification(AbsTaskZeroshotClassification):
+ metadata = TaskMetadata(
+ name="UCF101ZeroShot",
+ description="""UCF101 is an action recognition data set of realistic
+action videos collected from YouTube, having 101 action categories. This
+version of the dataset does not contain images but images saved frame by
+frame. Train and test splits are generated based on the authors' first
+version train/test list.""",
+ reference="https://huggingface.co/datasets/flwrlabs/ucf101",
+ dataset={
+ "path": "flwrlabs/ucf101",
+ "revision": "1098eed48f2929443f47c39f3b5c814e16369c11",
+ },
+ type="ZeroShotClassification",
+ category="i2t",
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2012-01-01",
+ "2012-12-01",
+ ), # Estimated range for the collection of reviews
+ domains=["Scene"],
+ task_subtypes=["Activity recognition"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ modalities=["image"],
+ sample_creation="created",
+ bibtex_citation="""@misc{soomro2012ucf101dataset101human,
+ title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild},
+ author={Khurram Soomro and Amir Roshan Zamir and Mubarak Shah},
+ year={2012},
+ eprint={1212.0402},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/1212.0402},
+}""",
+ descriptive_stats={
+ "n_samples": {"test": 697222},
+ "avg_character_length": {"test": 0},
+ },
+ )
+
+ def get_candidate_labels(self) -> list[str]:
+ return [
+ f"a photo of {name}"
+ for name in self.dataset["test"].features[self.label_column_name].names
+ ]
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/__init__.py b/mteb/tasks/Image/ZeroshotClassification/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/Country211_labels.txt b/mteb/tasks/Image/ZeroshotClassification/eng/templates/Country211_labels.txt
new file mode 100644
index 0000000000..b7c09926c8
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/templates/Country211_labels.txt
@@ -0,0 +1,211 @@
+Andorra
+United Arab Emirates
+Afghanistan
+Antigua and Barbuda
+Anguilla
+Albania
+Armenia
+Angola
+Antarctica
+Argentina
+Austria
+Australia
+Aruba
+Aland Islands
+Azerbaijan
+Bosnia and Herzegovina
+Barbados
+Bangladesh
+Belgium
+Burkina Faso
+Bulgaria
+Bahrain
+Benin
+Bermuda
+Brunei Darussalam
+Bolivia
+Bonaire, Saint Eustatius and Saba
+Brazil
+Bahamas
+Bhutan
+Botswana
+Belarus
+Belize
+Canada
+DR Congo
+Central African Republic
+Switzerland
+Cote d'Ivoire
+Cook Islands
+Chile
+Cameroon
+China
+Colombia
+Costa Rica
+Cuba
+Cabo Verde
+Curacao
+Cyprus
+Czech Republic
+Germany
+Denmark
+Dominica
+Dominican Republic
+Algeria
+Ecuador
+Estonia
+Egypt
+Spain
+Ethiopia
+Finland
+Fiji
+Falkland Islands
+Faeroe Islands
+France
+Gabon
+United Kingdom
+Grenada
+Georgia
+French Guiana
+Guernsey
+Ghana
+Gibraltar
+Greenland
+Gambia
+Guadeloupe
+Greece
+South Georgia and South Sandwich Is.
+Guatemala
+Guam
+Guyana
+Hong Kong
+Honduras
+Croatia
+Haiti
+Hungary
+Indonesia
+Ireland
+Israel
+Isle of Man
+India
+Iraq
+Iran
+Iceland
+Italy
+Jersey
+Jamaica
+Jordan
+Japan
+Kenya
+Kyrgyz Republic
+Cambodia
+St. Kitts and Nevis
+North Korea
+South Korea
+Kuwait
+Cayman Islands
+Kazakhstan
+Laos
+Lebanon
+St. Lucia
+Liechtenstein
+Sri Lanka
+Liberia
+Lithuania
+Luxembourg
+Latvia
+Libya
+Morocco
+Monaco
+Moldova
+Montenegro
+Saint-Martin
+Madagascar
+Macedonia
+Mali
+Myanmar
+Mongolia
+Macau
+Martinique
+Mauritania
+Malta
+Mauritius
+Maldives
+Malawi
+Mexico
+Malaysia
+Mozambique
+Namibia
+New Caledonia
+Nigeria
+Nicaragua
+Netherlands
+Norway
+Nepal
+New Zealand
+Oman
+Panama
+Peru
+French Polynesia
+Papua New Guinea
+Philippines
+Pakistan
+Poland
+Puerto Rico
+Palestine
+Portugal
+Palau
+Paraguay
+Qatar
+Reunion
+Romania
+Serbia
+Russia
+Rwanda
+Saudi Arabia
+Solomon Islands
+Seychelles
+Sudan
+Sweden
+Singapore
+St. Helena
+Slovenia
+Svalbard and Jan Mayen Islands
+Slovakia
+Sierra Leone
+San Marino
+Senegal
+Somalia
+South Sudan
+El Salvador
+Sint Maarten
+Syria
+Eswatini
+Togo
+Thailand
+Tajikistan
+Timor-Leste
+Turkmenistan
+Tunisia
+Tonga
+Turkey
+Trinidad and Tobago
+Taiwan
+Tanzania
+Ukraine
+Uganda
+United States
+Uruguay
+Uzbekistan
+Vatican
+Venezuela
+British Virgin Islands
+United States Virgin Islands
+Vietnam
+Vanuatu
+Samoa
+Kosovo
+Yemen
+South Africa
+Zambia
+Zimbabwe
\ No newline at end of file
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/GTSRB_labels.txt b/mteb/tasks/Image/ZeroshotClassification/eng/templates/GTSRB_labels.txt
new file mode 100644
index 0000000000..2049335509
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/templates/GTSRB_labels.txt
@@ -0,0 +1,43 @@
+red and white circle 20 kph speed limit
+red and white circle 30 kph speed limit
+red and white circle 50 kph speed limit
+red and white circle 60 kph speed limit
+red and white circle 70 kph speed limit
+red and white circle 80 kph speed limit
+end / de-restriction of 80 kph speed limit
+red and white circle 100 kph speed limit
+red and white circle 120 kph speed limit
+red and white circle red car and black car no passing
+red and white circle red truck and black car no passing
+red and white triangle road intersection warning
+white and yellow diamond priority road
+red and white upside down triangle yield right-of-way
+stop
+empty red and white circle
+red and white circle no truck entry
+red circle with white horizontal stripe no entry
+red and white triangle with exclamation mark warning
+red and white triangle with black left curve approaching warning
+red and white triangle with black right curve approaching warning
+red and white triangle with black double curve approaching warning
+red and white triangle rough / bumpy road warning
+red and white triangle car skidding / slipping warning
+red and white triangle with merging / narrow lanes warning
+red and white triangle with person digging / construction / road work warning
+red and white triangle with traffic light approaching warning
+red and white triangle with person walking warning
+red and white triangle with child and person walking warning
+red and white triangle with bicycle warning
+red and white triangle with snowflake / ice warning
+red and white triangle with deer warning
+white circle with gray strike bar no speed limit
+blue circle with white right turn arrow mandatory
+blue circle with white left turn arrow mandatory
+blue circle with white forward arrow mandatory
+blue circle with white forward or right turn arrow mandatory
+blue circle with white forward or left turn arrow mandatory
+blue circle with white keep right arrow mandatory
+blue circle with white keep left arrow mandatory
+blue circle with white arrows indicating a traffic circle
+white circle with gray strike bar indicating no passing for cars has ended
+white circle with gray strike bar indicating no passing for trucks has ended
\ No newline at end of file
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/Imagenet1k_labels.txt b/mteb/tasks/Image/ZeroshotClassification/eng/templates/Imagenet1k_labels.txt
new file mode 100644
index 0000000000..666b01ac0b
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/templates/Imagenet1k_labels.txt
@@ -0,0 +1,1000 @@
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead shark
+electric ray
+stingray
+rooster
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+American robin
+bulbul
+jay
+magpie
+chickadee
+American dipper
+kite (bird of prey)
+bald eagle
+vulture
+great grey owl
+fire salamander
+smooth newt
+newt
+spotted salamander
+axolotl
+American bullfrog
+tree frog
+tailed frog
+loggerhead sea turtle
+leatherback sea turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+green iguana
+Carolina anole
+desert grassland whiptail lizard
+agama
+frilled-necked lizard
+alligator lizard
+Gila monster
+European green lizard
+chameleon
+Komodo dragon
+Nile crocodile
+American alligator
+triceratops
+worm snake
+ring-necked snake
+eastern hog-nosed snake
+smooth green snake
+kingsnake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+African rock python
+Indian cobra
+green mamba
+sea snake
+Saharan horned viper
+eastern diamondback rattlesnake
+sidewinder rattlesnake
+trilobite
+harvestman
+scorpion
+yellow garden spider
+barn spider
+European garden spider
+southern black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie grouse
+peafowl
+quail
+partridge
+african grey parrot
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+duck
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+red king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+great egret
+bittern bird
+crane bird
+limpkin
+common gallinule
+American coot
+bustard
+ruddy turnstone
+dunlin
+common redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese Chin
+Maltese
+Pekingese
+Shih Tzu
+King Charles Spaniel
+Papillon
+toy terrier
+Rhodesian Ridgeback
+Afghan Hound
+Basset Hound
+Beagle
+Bloodhound
+Bluetick Coonhound
+Black and Tan Coonhound
+Treeing Walker Coonhound
+English foxhound
+Redbone Coonhound
+borzoi
+Irish Wolfhound
+Italian Greyhound
+Whippet
+Ibizan Hound
+Norwegian Elkhound
+Otterhound
+Saluki
+Scottish Deerhound
+Weimaraner
+Staffordshire Bull Terrier
+American Staffordshire Terrier
+Bedlington Terrier
+Border Terrier
+Kerry Blue Terrier
+Irish Terrier
+Norfolk Terrier
+Norwich Terrier
+Yorkshire Terrier
+Wire Fox Terrier
+Lakeland Terrier
+Sealyham Terrier
+Airedale Terrier
+Cairn Terrier
+Australian Terrier
+Dandie Dinmont Terrier
+Boston Terrier
+Miniature Schnauzer
+Giant Schnauzer
+Standard Schnauzer
+Scottish Terrier
+Tibetan Terrier
+Australian Silky Terrier
+Soft-coated Wheaten Terrier
+West Highland White Terrier
+Lhasa Apso
+Flat-Coated Retriever
+Curly-coated Retriever
+Golden Retriever
+Labrador Retriever
+Chesapeake Bay Retriever
+German Shorthaired Pointer
+Vizsla
+English Setter
+Irish Setter
+Gordon Setter
+Brittany dog
+Clumber Spaniel
+English Springer Spaniel
+Welsh Springer Spaniel
+Cocker Spaniel
+Sussex Spaniel
+Irish Water Spaniel
+Kuvasz
+Schipperke
+Groenendael dog
+Malinois
+Briard
+Australian Kelpie
+Komondor
+Old English Sheepdog
+Shetland Sheepdog
+collie
+Border Collie
+Bouvier des Flandres dog
+Rottweiler
+German Shepherd Dog
+Dobermann
+Miniature Pinscher
+Greater Swiss Mountain Dog
+Bernese Mountain Dog
+Appenzeller Sennenhund
+Entlebucher Sennenhund
+Boxer
+Bullmastiff
+Tibetan Mastiff
+French Bulldog
+Great Dane
+St. Bernard
+husky
+Alaskan Malamute
+Siberian Husky
+Dalmatian
+Affenpinscher
+Basenji
+pug
+Leonberger
+Newfoundland dog
+Great Pyrenees dog
+Samoyed
+Pomeranian
+Chow Chow
+Keeshond
+brussels griffon
+Pembroke Welsh Corgi
+Cardigan Welsh Corgi
+Toy Poodle
+Miniature Poodle
+Standard Poodle
+Mexican hairless dog (xoloitzcuintli)
+grey wolf
+Alaskan tundra wolf
+red wolf or maned wolf
+coyote
+dingo
+dhole
+African wild dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby cat
+tiger cat
+Persian cat
+Siamese cat
+Egyptian Mau
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+polar bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+longhorn beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket insect
+stick insect
+cockroach
+praying mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+red admiral butterfly
+ringlet butterfly
+monarch butterfly
+small white butterfly
+sulphur butterfly
+gossamer-winged butterfly
+starfish
+sea urchin
+sea cucumber
+cottontail rabbit
+hare
+Angora rabbit
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+common sorrel horse
+zebra
+pig
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram (adult male sheep)
+bighorn sheep
+Alpine ibex
+hartebeest
+impala (antelope)
+gazelle
+arabian camel
+llama
+weasel
+mink
+European polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas monkey
+baboon
+macaque
+langur
+black-and-white colobus
+proboscis monkey
+marmoset
+white-headed capuchin
+howler monkey
+titi monkey
+Geoffroy's spider monkey
+common squirrel monkey
+ring-tailed lemur
+indri
+Asian elephant
+African bush elephant
+red panda
+giant panda
+snoek fish
+eel
+silver salmon
+rock beauty fish
+clownfish
+sturgeon
+gar fish
+lionfish
+pufferfish
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibious vehicle
+analog clock
+apiary
+apron
+trash can
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint pen
+Band-Aid
+banjo
+baluster / handrail
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+wheelbarrow
+baseball
+basketball
+bassinet
+bassoon
+swimming cap
+bath towel
+bathtub
+station wagon
+lighthouse
+beaker
+military hat (bearskin or shako)
+beer bottle
+beer glass
+bell tower
+baby bib
+tandem bicycle
+bikini
+ring binder
+binoculars
+birdhouse
+boathouse
+bobsleigh
+bolo tie
+poke bonnet
+bookcase
+bookstore
+bottle cap
+hunting bow
+bow tie
+brass memorial plaque
+bra
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+high-speed train
+butcher shop
+taxicab
+cauldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+tool kit
+cardboard box / carton
+car wheel
+automated teller machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+mobile phone
+chain
+chain-link fence
+chain mail
+chainsaw
+storage chest
+chiffonier
+bell or wind chime
+china cabinet
+Christmas stocking
+church
+movie theater
+cleaver
+cliff dwelling
+cloak
+clogs
+cocktail shaker
+coffee mug
+coffeemaker
+spiral or coil
+combination lock
+computer keyboard
+candy store
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+construction crane
+crash helmet
+crate
+infant bed
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+rotary dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishcloth
+dishwasher
+disc brake
+dock
+dog sled
+dome
+doormat
+drilling rig
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso machine
+face powder
+feather boa
+filing cabinet
+fireboat
+fire truck
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster bed
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gas mask or respirator
+gas pump
+goblet
+go-kart
+golf ball
+golf cart
+gondola
+gong
+gown
+grand piano
+greenhouse
+radiator grille
+grocery store
+guillotine
+hair clip
+hair spray
+half-track
+hammer
+hamper
+hair dryer
+hand-held computer
+handkerchief
+hard disk drive
+harmonica
+harp
+combine harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoop skirt
+gymnastic horizontal bar
+horse-drawn vehicle
+hourglass
+iPod
+clothes iron
+carved pumpkin
+jeans
+jeep
+T-shirt
+jigsaw puzzle
+rickshaw
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop computer
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+ocean liner
+lipstick
+slip-on shoe
+lotion
+music speaker
+loupe magnifying glass
+sawmill
+magnetic compass
+messenger bag
+mailbox
+tights
+one-piece bathing suit
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine cabinet
+megalith
+microphone
+microwave oven
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+ford model t
+modem
+monastery
+monitor
+moped
+mortar and pestle
+graduation cap
+mosque
+mosquito net
+vespa
+mountain bike
+tent
+computer mouse
+mousetrap
+moving van
+muzzle
+metal nail
+neck brace
+necklace
+baby pacifier
+notebook computer
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+pipe organ
+oscilloscope
+overskirt
+bullock cart
+oxygen mask
+product packet / packaging
+paddle
+paddle wheel
+padlock
+paintbrush
+pajamas
+palace
+pan flute
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+railroad car
+patio
+payphone
+pedestal
+pencil case
+pencil sharpener
+perfume
+Petri dish
+photocopier
+plectrum
+Pickelhaube
+picket fence
+pickup truck
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate ship
+drink pitcher
+block plane
+planetarium
+plastic bag
+plate rack
+farm plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+soda bottle
+plant pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+missile
+projector
+hockey puck
+punching bag
+purse
+quill
+quilt
+race car
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+fishing casting reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+eraser
+rugby ball
+ruler measuring stick
+sneaker
+safe
+safety pin
+salt shaker
+sandal
+sarong
+saxophone
+scabbard
+weighing scale
+school bus
+schooner
+scoreboard
+CRT monitor
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe store
+shoji screen / room divider
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+balaclava ski mask
+sleeping bag
+slide rule
+sliding door
+slot machine
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar thermal collector
+sombrero
+soup bowl
+keyboard space bar
+space heater
+space shuttle
+spatula
+motorboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+through arch bridge
+steel drum
+stethoscope
+scarf
+stone wall
+stopwatch
+stove
+strainer
+tram
+stretcher
+couch
+stupa
+submarine
+suit
+sundial
+sunglasses
+sunglasses
+sunscreen
+suspension bridge
+mop
+sweatshirt
+swim trunks / shorts
+swing
+electrical switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy bear
+television
+tennis ball
+thatched roof
+front curtain
+thimble
+threshing machine
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toy store
+tractor
+semi-trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+hot tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright piano
+vacuum cleaner
+vase
+vaulted or arched ceiling
+velvet fabric
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+military aircraft
+sink
+washing machine
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+hair wig
+window screen
+window shade
+Windsor tie
+wine bottle
+airplane wing
+wok
+wooden spoon
+wool
+split-rail fence
+shipwreck
+sailboat
+yurt
+website
+comic book
+crossword
+traffic or street sign
+traffic light
+dust jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+popsicle
+baguette
+bagel
+pretzel
+cheeseburger
+hot dog
+mashed potatoes
+cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith apple
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+cherimoya (custard apple)
+pomegranate
+hay
+carbonara
+chocolate syrup
+dough
+meatloaf
+pizza
+pot pie
+burrito
+red wine
+espresso
+tea cup
+eggnog
+mountain
+bubble
+cliff
+coral reef
+geyser
+lakeshore
+promontory
+sandbar
+beach
+valley
+volcano
+baseball player
+bridegroom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+rose hip
+horse chestnut seed
+coral fungus
+agaric
+gyromitra
+stinkhorn mushroom
+earth star fungus
+hen of the woods mushroom
+bolete
+corn cob
+toilet paper
\ No newline at end of file
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/PatchCamelyon_labels.txt b/mteb/tasks/Image/ZeroshotClassification/eng/templates/PatchCamelyon_labels.txt
new file mode 100644
index 0000000000..4446eab039
--- /dev/null
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/templates/PatchCamelyon_labels.txt
@@ -0,0 +1,2 @@
+lymph node
+lymph node containing metastatic tumor tissue
\ No newline at end of file
diff --git a/mteb/tasks/Image/__init__.py b/mteb/tasks/Image/__init__.py
new file mode 100644
index 0000000000..8f1c2d27f7
--- /dev/null
+++ b/mteb/tasks/Image/__init__.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from .Any2AnyMultiChoice import *
+from .Any2AnyRetrieval import *
+from .Any2TextMultipleChoice import *
+from .Clustering import *
+from .ImageClassification import *
+from .ImageMultilabelClassification import *
+from .ImageTextPairClassification import *
+from .VisualSTS import *
+from .ZeroshotClassification import *
diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py
index 4a9b2e743d..393b121f3f 100644
--- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py
+++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py
@@ -47,8 +47,7 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval):
eval_langs=DOMAINS_langs,
main_score="ndcg_at_10",
date=("2024-03-01", "2024-06-01"),
- form=["written"],
- domains=["Non-fiction"],
+ domains=["Non-fiction", "Written"],
task_subtypes=["Article retrieval"],
license="cc-by-4.0",
socioeconomic_status="low",
diff --git a/mteb/tasks/Retrieval/eng/NQRetrieval.py b/mteb/tasks/Retrieval/eng/NQRetrieval.py
index 85e45e832c..e81018dbc4 100644
--- a/mteb/tasks/Retrieval/eng/NQRetrieval.py
+++ b/mteb/tasks/Retrieval/eng/NQRetrieval.py
@@ -67,4 +67,7 @@ class NQHardNegatives(AbsTaskRetrieval):
and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le
and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational
Linguistics}}""",
+ prompt={
+ "query": "Given a question, retrieve Wikipedia passages that answer the question"
+ },
)
diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py
index 8877da707c..e00f091174 100644
--- a/mteb/tasks/__init__.py
+++ b/mteb/tasks/__init__.py
@@ -4,6 +4,14 @@
from .BitextMining import *
from .Classification import *
from .Clustering import *
+from .Image.Any2AnyMultiChoice import *
+from .Image.Any2AnyRetrieval import *
+from .Image.Clustering import *
+from .Image.ImageClassification import *
+from .Image.ImageMultilabelClassification import *
+from .Image.ImageTextPairClassification import *
+from .Image.VisualSTS import *
+from .Image.ZeroshotClassification import *
from .InstructionRetrieval import *
from .MultiLabelClassification import *
from .PairClassification import *
diff --git a/pyproject.toml b/pyproject.toml
index 1ccdad72db..c91d510165 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ dependencies = [
"typing_extensions>=0.0.0",
"eval_type_backport>=0.0.0",
"polars>=0.20.22",
+ "torchvision>0.0.0",
]
@@ -67,6 +68,7 @@ pylate = ["pylate>=1.1.4"]
bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"]
gritlm = ["gritlm>=1.0.2"]
xformers = ["xformers>=0.0.29"]
+blip2 = ["salesforce-lavis>=1.0.2"]
[tool.coverage.report]
diff --git a/scripts/data/flickr30k/build_flickr_30k_i2t.py b/scripts/data/flickr30k/build_flickr_30k_i2t.py
new file mode 100644
index 0000000000..2abd1f1682
--- /dev/null
+++ b/scripts/data/flickr30k/build_flickr_30k_i2t.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+import os
+
+from datasets import Dataset, DatasetDict, load_dataset
+from tqdm import tqdm
+
+WRITE_TOK = os.environ["HF_TOKEN"]
+
+eval_split = "test"
+data_raw = load_dataset("clip-benchmark/wds_flickr30k")[eval_split]
+
+
+## i2t
+queries_ = {"id": [], "modality": [], "image": []}
+corpus_ = {"id": [], "modality": [], "text": []}
+relevant_docs_ = {"query-id": [], "corpus-id": [], "score": []}
+
+for row in tqdm(data_raw, total=len(data_raw)):
+ image = row["jpg"]
+ texts = row["txt"].split("\n")
+ key = row["__key__"]
+ query_id = f"q_{key}"
+ queries_["id"].append(query_id)
+ queries_["image"].append(image)
+ queries_["modality"].append("image")
+
+ for i, text in enumerate(texts):
+ doc_id = f"d_{key}_{i}"
+ corpus_["id"].append(doc_id)
+ corpus_["text"].append(text)
+ corpus_["modality"].append("text")
+
+ relevant_docs_["query-id"].append(query_id)
+ relevant_docs_["corpus-id"].append(doc_id)
+ relevant_docs_["score"].append(1)
+
+corpus = Dataset.from_dict(corpus_)
+queries = Dataset.from_dict(queries_)
+relevant_docs = Dataset.from_dict(relevant_docs_)
+
+corpus = DatasetDict({"corpus": corpus})
+queries = DatasetDict({"test": queries})
+relevant_docs = DatasetDict({"test": relevant_docs})
+
+
+repo_name = "isaacchung/flickr30ki2t"
+# create_repo(repo_name, repo_type="dataset", token=WRITE_TOK)
+
+corpus.push_to_hub(repo_name, "corpus", token=WRITE_TOK)
+queries.push_to_hub(repo_name, "query", token=WRITE_TOK)
+relevant_docs.push_to_hub(repo_name, "qrels", token=WRITE_TOK)
diff --git a/scripts/data/flickr30k/build_flickr_30k_t2i.py b/scripts/data/flickr30k/build_flickr_30k_t2i.py
new file mode 100644
index 0000000000..b000de6e4d
--- /dev/null
+++ b/scripts/data/flickr30k/build_flickr_30k_t2i.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import os
+
+from datasets import Dataset, DatasetDict, load_dataset
+from huggingface_hub import create_repo
+from tqdm import tqdm
+
+WRITE_TOK = os.environ["HF_TOKEN"]
+
+eval_split = "test"
+data_raw = load_dataset("clip-benchmark/wds_flickr30k")[eval_split]
+
+
+## t2i
+queries_ = {"id": [], "modality": [], "text": []}
+corpus_ = {"id": [], "modality": [], "image": []}
+relevant_docs_ = {"query-id": [], "corpus-id": [], "score": []}
+
+for row in tqdm(data_raw, total=len(data_raw)):
+ image = row["jpg"]
+ texts = row["txt"].split("\n")
+ key = row["__key__"]
+
+ doc_id = f"d_{key}"
+ corpus_["id"].append(doc_id)
+ corpus_["image"].append(image)
+ corpus_["modality"].append("image")
+
+ for i, text in enumerate(texts):
+ query_id = f"q_{key}_{i}"
+ queries_["id"].append(query_id)
+ queries_["text"].append(text)
+ queries_["modality"].append("text")
+
+ relevant_docs_["query-id"].append(query_id)
+ relevant_docs_["corpus-id"].append(doc_id)
+ relevant_docs_["score"].append(1)
+
+corpus = Dataset.from_dict(corpus_)
+queries = Dataset.from_dict(queries_)
+relevant_docs = Dataset.from_dict(relevant_docs_)
+
+corpus = DatasetDict({"corpus": corpus})
+queries = DatasetDict({"test": queries})
+relevant_docs = DatasetDict({"test": relevant_docs})
+
+
+repo_name = "isaacchung/flickr30kt2i"
+create_repo(repo_name, repo_type="dataset", token=WRITE_TOK)
+
+corpus.push_to_hub(repo_name, "corpus", token=WRITE_TOK)
+queries.push_to_hub(repo_name, "query", token=WRITE_TOK)
+relevant_docs.push_to_hub(repo_name, "qrels", token=WRITE_TOK)
diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py
index 6cbaa2c298..84a81fca26 100644
--- a/scripts/extract_model_names.py
+++ b/scripts/extract_model_names.py
@@ -28,6 +28,7 @@ def get_changed_files(base_branch="main"):
and f.endswith(".py")
and "overview" not in f
and "init" not in f
+ and "instructions" not in f
]
@@ -48,14 +49,26 @@ def extract_model_names(
and isinstance(node.value.func, ast.Name)
and node.value.func.id == "ModelMeta"
):
- model_name = next(
- (
- kw.value.value
- for kw in node.value.keywords
- if kw.arg == "name"
- ),
- None,
- )
+ try:
+ model_name = next(
+ (
+ kw.value.value
+ for kw in node.value.keywords
+ if kw.arg == "name"
+ ),
+ None,
+ )
+ except AttributeError:
+ # For cases where name is assigned a variable and not a direct string,
+ # e.g. in gme_v_models.py: `name=HF_GME_QWEN2VL_2B`
+ model_name = next(
+ (
+ kw.value.id
+ for kw in node.value.keywords
+ if kw.arg == "name"
+ ),
+ None,
+ )
if model_name:
model_names.append(model_name)
first_model_found = True
diff --git a/scripts/run_mieb.py b/scripts/run_mieb.py
new file mode 100644
index 0000000000..b3c55b26d5
--- /dev/null
+++ b/scripts/run_mieb.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import mteb
+
+for model_name in [
+ "openai/clip-vit-base-patch32",
+ "openai/clip-vit-base-patch16",
+ "openai/clip-vit-large-patch14",
+ "royokong/e5-v",
+ "BAAI/bge-visualized-base",
+ "BAAI/bge-visualized-m3",
+ "kakaobrain/align-base",
+ "jinaai/jina-clip-v1",
+ "nomic-ai/nomic-embed-vision-v1.5",
+ "Salesforce/blip-image-captioning-large",
+ "Salesforce/blip-image-captioning-base",
+ "Salesforce/blip2-opt-2.7b",
+ "Salesforce/blip2-opt-6.7b-coco",
+ "facebook/dinov2-small",
+ "facebook/dinov2-base",
+ "facebook/dinov2-large",
+ "facebook/dinov2-giant",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "nyu-visionx/moco-v3-vit-b",
+ "nyu-visionx/moco-v3-vit-l",
+ "google/siglip-so400m-patch14-224",
+ "google/siglip-so400m-patch14-384",
+ "google/siglip-so400m-patch16-256-i18n",
+ "google/siglip-base-patch16-256-multilingual",
+ "google/siglip-base-patch16-256",
+ "google/siglip-base-patch16-512",
+ "google/siglip-base-patch16-384",
+ "google/siglip-base-patch16-224",
+ "google/siglip-large-patch16-256",
+ "google/siglip-large-patch16-384",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+ "laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+ "laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+ "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+ "TIGER-Lab/VLM2Vec-LoRA",
+ "TIGER-Lab/VLM2Vec-Full",
+ "Salesforce/blip-itm-base-coco",
+ "Salesforce/blip-itm-large-coco",
+ "Salesforce/blip-itm-base-flickr",
+ "Salesforce/blip-itm-large-flickr",
+ "EVA02-CLIP-B-16",
+ "EVA02-CLIP-L-14",
+ "EVA02-CLIP-bigE-14",
+ "EVA02-CLIP-bigE-14-plus",
+ # "embed-english-v3.0-v", # not feasible to run due to the 40 images/min constraint
+]:
+ model = mteb.get_model(model_name)
+ tasks = mteb.get_tasks(
+ task_types=[
+ "Any2AnyRetrieval",
+ "Any2AnyMultiChoice",
+ "Any2TextMutipleChoice",
+ "ImageClustering",
+ "ImageClassification",
+ "ImageMultilabelClassification",
+ "ImageTextPairClassification",
+ "VisualSTS",
+ "ZeroShotClassification",
+ ]
+ )
+ # get i-only tasks for i-only models.
+ if ("moco" in model_name) or ("dinov2" in model_name):
+ tasks = [task for task in tasks if "t" not in task.metadata.category]
+
+ evaluation = mteb.MTEB(tasks=tasks)
+ results = evaluation.run(model, output_folder="results-mieb-final")
diff --git a/scripts/run_mieb_get_params.py b/scripts/run_mieb_get_params.py
new file mode 100644
index 0000000000..a28a35ef65
--- /dev/null
+++ b/scripts/run_mieb_get_params.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import pandas as pd
+import torch
+from tqdm import tqdm
+
+import mteb
+
+params = []
+
+# add all model names
+model_names = [ # "google/siglip-base-patch16-512",
+ "google/siglip-so400m-patch14-384"
+ # ...
+]
+
+for model_name in tqdm(model_names):
+ model = mteb.get_model(model_name)
+
+ total_params = sum(p.numel() for p in model.model.parameters())
+ total_params = total_params / 1e6
+ params.append([model_name, total_params])
+
+ del model
+ torch.cuda.empty_cache()
+
+param_frame = pd.DataFrame(params, columns=["model name", "# params"])
+param_frame.to_csv("params.csv", index=False)
diff --git a/scripts/run_mieb_kshot_ablation.py b/scripts/run_mieb_kshot_ablation.py
new file mode 100644
index 0000000000..a277c0d787
--- /dev/null
+++ b/scripts/run_mieb_kshot_ablation.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import mteb
+
+for model_name in [
+ # key ones for this ablation (different types of models)
+ "openai/clip-vit-base-patch32",
+ "openai/clip-vit-base-patch16",
+ "openai/clip-vit-large-patch14",
+ "royokong/e5-v",
+ "facebook/dinov2-small",
+ "facebook/dinov2-base",
+ "facebook/dinov2-large",
+ "facebook/dinov2-giant",
+ # more insights
+ "BAAI/bge-visualized-base",
+ "BAAI/bge-visualized-m3",
+ "google/siglip-so400m-patch14-384",
+ "google/siglip-base-patch16-256-multilingual",
+ "google/siglip-base-patch16-256",
+ "google/siglip-base-patch16-512",
+ "google/siglip-base-patch16-384",
+ "google/siglip-base-patch16-224",
+ "google/siglip-large-patch16-256",
+ "google/siglip-large-patch16-384",
+ "nyu-visionx/moco-v3-vit-b",
+ "nyu-visionx/moco-v3-vit-l",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+ "laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+ "laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+ "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+ "EVA02-CLIP-B-16",
+ "EVA02-CLIP-L-14",
+ "EVA02-CLIP-bigE-14",
+ "EVA02-CLIP-bigE-14-plus",
+ "TIGER-Lab/VLM2Vec-LoRA",
+ "TIGER-Lab/VLM2Vec-Full",
+ # run if enough compute:
+ # "Salesforce/blip-itm-base-coco",
+ # "Salesforce/blip-itm-large-coco",
+ # "Salesforce/blip-itm-base-flickr",
+ # "Salesforce/blip-itm-large-flickr",
+ # "kakaobrain/align-base",
+ # "jinaai/jina-clip-v1",
+ # "nomic-ai/nomic-embed-vision-v1.5",
+ # "Salesforce/blip2-opt-2.7b",
+ # "Salesforce/blip2-opt-6.7b-coco",
+ # "embed-english-v3.0-v", # not feasible to run due to the 40 images/min constraint
+]:
+    # samples_per_label defaults to 16, so 16 is omitted from the k-shot sweep below
+
+ for k_shot in [8, 32, 64, 128, 256]:
+ model = mteb.get_model(model_name)
+ tasks = mteb.get_tasks(
+ task_types=[
+ "ImageClassification",
+ ]
+ )
+ for task in tasks:
+ task.samples_per_label = k_shot
+ evaluation = mteb.MTEB(tasks=tasks)
+ results = evaluation.run(
+ model, output_folder=f"results-mieb-final/linear_probe_{k_shot}"
+ )
diff --git a/scripts/run_mieb_missed_results.py b/scripts/run_mieb_missed_results.py
new file mode 100644
index 0000000000..5c92289f61
--- /dev/null
+++ b/scripts/run_mieb_missed_results.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import mteb
+
+# rerun
+for model_name in [
+ "openai/clip-vit-base-patch32",
+ "openai/clip-vit-base-patch16",
+ "openai/clip-vit-large-patch14",
+ "royokong/e5-v",
+ "BAAI/bge-visualized-base",
+ "BAAI/bge-visualized-m3",
+ "kakaobrain/align-base",
+ "jinaai/jina-clip-v1",
+ "nomic-ai/nomic-embed-vision-v1.5",
+ # "Salesforce/blip-image-captioning-large",
+ # "Salesforce/blip-image-captioning-base",
+ "Salesforce/blip2-opt-2.7b",
+ "Salesforce/blip2-opt-6.7b-coco",
+ "facebook/dinov2-small",
+ "facebook/dinov2-base",
+ "facebook/dinov2-large",
+ "facebook/dinov2-giant",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "nyu-visionx/moco-v3-vit-b",
+ "nyu-visionx/moco-v3-vit-l",
+ "google/siglip-so400m-patch14-384",
+ "google/siglip-base-patch16-256-multilingual",
+ "google/siglip-base-patch16-256",
+ "google/siglip-base-patch16-512",
+ "google/siglip-base-patch16-384",
+ "google/siglip-base-patch16-224",
+ "google/siglip-large-patch16-256",
+ "google/siglip-large-patch16-384",
+ "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
+ "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
+ "laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+ "laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+ "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+ "TIGER-Lab/VLM2Vec-LoRA",
+ "TIGER-Lab/VLM2Vec-Full",
+ "Salesforce/blip-itm-base-coco",
+ "Salesforce/blip-itm-large-coco",
+ "Salesforce/blip-itm-base-flickr",
+ "Salesforce/blip-itm-large-flickr",
+ "EVA02-CLIP-B-16",
+ "EVA02-CLIP-L-14",
+ "EVA02-CLIP-bigE-14",
+ "EVA02-CLIP-bigE-14-plus",
+]:
+ model = mteb.get_model(model_name)
+ tasks = mteb.get_tasks(
+ tasks=[
+ "ROxfordEasyI2IMultiChoice",
+ "ROxfordHardI2IMultiChoice",
+ "ROxfordMediumI2IMultiChoice",
+ "RParisEasyI2IMultiChoice",
+ "RParisHardI2IMultiChoice",
+ "RParisMediumI2IMultiChoice",
+ "BLINKIT2IRetrieval",
+ "BLINKIT2TRetrieval",
+ "BLINKIT2IMultiChoice",
+ "BLINKIT2TMultiChoice",
+ "Flickr30kI2TRetrieval",
+ "Flickr30kT2IRetrieval",
+ ]
+ )
+ # get i-only tasks for i-only models.
+ if ("moco" in model_name) or ("dinov2" in model_name):
+ tasks = [task for task in tasks if "t" not in task.metadata.category]
+
+ evaluation = mteb.MTEB(tasks=tasks)
+ results = evaluation.run(model, output_folder="results-mieb-rerun2")
+
+# missing tasks: rerun VLM2Vec-Full on the CVBench tasks it was missing
+model_name = "TIGER-Lab/VLM2Vec-Full"
+model = mteb.get_model(model_name)
+tasks = mteb.get_tasks(
+ tasks=[
+ "CVBenchCount",
+ "CVBenchDepth",
+ "CVBenchDistance",
+ "CVBenchRelation",
+ ]
+)
+evaluation = mteb.MTEB(tasks=tasks)
+results = evaluation.run(model, output_folder="results-mieb-rerun2")
diff --git a/scripts/run_mieb_rerun_siglip.py b/scripts/run_mieb_rerun_siglip.py
new file mode 100644
index 0000000000..539a31e2e7
--- /dev/null
+++ b/scripts/run_mieb_rerun_siglip.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import mteb
+
+for model_name in [
+ "google/siglip-so400m-patch14-384",
+ "google/siglip-base-patch16-256-multilingual",
+ "google/siglip-base-patch16-256",
+ "google/siglip-base-patch16-512",
+ "google/siglip-base-patch16-384",
+ "google/siglip-base-patch16-224",
+ "google/siglip-large-patch16-256",
+ "google/siglip-large-patch16-384",
+]:
+ model = mteb.get_model(model_name)
+ tasks = mteb.get_tasks(
+ task_types=[
+ "Any2AnyRetrieval",
+ "Any2AnyMultiChoice",
+ "Any2TextMutipleChoice",
+ "ImageClustering",
+ "ImageClassification",
+ "ImageMultilabelClassification",
+ "ImageTextPairClassification",
+ # "VisualSTS", # visual sts does not need rerun as will be the same after fixed.
+ "ZeroShotClassification",
+ ]
+ )
+ evaluation = mteb.MTEB(tasks=tasks)
+ results = evaluation.run(model, output_folder="results-mieb-final/siglip_rerun")
diff --git a/scripts/run_mteb_bright.py b/scripts/run_mteb_bright.py
new file mode 100644
index 0000000000..e6420bb9e7
--- /dev/null
+++ b/scripts/run_mteb_bright.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from sentence_transformers import SentenceTransformer
+
+from mteb import MTEB
+from mteb.tasks.Retrieval.eng.BrightRetrieval import BrightRetrieval
+
+# testing the task with a model:
+model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+evaluation = MTEB(tasks=[BrightRetrieval()])
+evaluation.run(model, output_folder="results")
diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py
index 6b26cf67d4..1043c791f6 100644
--- a/tests/test_benchmark/mock_models.py
+++ b/tests/test_benchmark/mock_models.py
@@ -10,6 +10,7 @@
from numpy import ndarray
from sentence_transformers import CrossEncoder, SentenceTransformer
from torch import Tensor
+from torch.utils.data import DataLoader
import mteb
from mteb import SentenceTransformerWrapper
@@ -41,6 +42,30 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
return torch.randn(len(sentences), 10, dtype=torch.bfloat16)
+class MockCLIPEncoder:
+ def __init__(self):
+ pass
+
+ def get_text_embeddings(self, texts, **kwargs):
+ return torch.randn(len(texts), 10)
+
+ def get_image_embeddings(self, images, **kwargs):
+ if isinstance(images, DataLoader):
+ all_embeddings = []
+ for batch in images:
+ batch_embeddings = torch.randn(len(batch), 10)
+ all_embeddings.append(batch_embeddings)
+ return torch.cat(all_embeddings, dim=0)
+ else:
+ return torch.randn(len(images), 10)
+
+ def get_fused_embeddings(self, texts, images, **kwargs):
+ return torch.randn(len(texts), 10)
+
+ def calculate_probs(self, text_embeddings, image_embeddings):
+ return torch.randn(image_embeddings.shape[0], text_embeddings.shape[0])
+
+
class MockSentenceTransformer(SentenceTransformer):
"""A mock implementation of the SentenceTransformer intended to implement just the encode, method using the same arguments."""
diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py
index 7d3d2d7528..eea73b2e69 100644
--- a/tests/test_benchmark/mock_tasks.py
+++ b/tests/test_benchmark/mock_tasks.py
@@ -2,7 +2,9 @@
from __future__ import annotations
+import numpy as np
from datasets import Dataset, DatasetDict
+from PIL import Image
from mteb.abstasks import MultilingualTask
from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
@@ -18,6 +20,23 @@
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
from mteb.abstasks.AbsTaskSummarization import AbsTaskSummarization
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.Image.AbsTaskAny2TextMultipleChoice import (
+ AbsTaskAny2TextMultipleChoice,
+)
+from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.Image.AbsTaskImageMultilabelClassification import ( # noqa
+ AbsTaskImageMultilabelClassification,
+)
+from mteb.abstasks.Image.AbsTaskImageTextPairClassification import (
+ AbsTaskImageTextPairClassification,
+)
+from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
+ AbsTaskZeroshotClassification,
+)
from mteb.abstasks.TaskMetadata import TaskMetadata
general_args = {
@@ -2002,3 +2021,889 @@ def load_data(self, **kwargs):
"fra": short_instructions,
}
self.data_loaded = True
+
+
+class MockMultiChoiceTask(AbsTaskAny2AnyMultiChoice):
+ metadata = TaskMetadata(
+ type="Any2AnyMultiChoice",
+ name="MockMultiChoice",
+ main_score="accuracy",
+ descriptive_stats={
+ "test": {
+ "num_samples": 2,
+ "average_question_length": 26.0,
+ "average_choice_length": 30.5,
+ "unique_labels": 2,
+ "labels": {"1": {"count": 1}, "0": {"count": 1}},
+ }
+ },
+ **general_args, # type: ignore
+ )
+ metadata.modalities = ["image", "text"]
+ metadata.category = "it2i"
+
+ def load_data(self, **kwargs):
+ images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+ images = [
+ Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+ ]
+
+ self.corpus = {
+ "test": Dataset.from_dict(
+ {
+ "id": ["d1", "d2"],
+ "image": [images[i] for i in range(2)],
+ "modality": ["image" for _ in range(2)],
+ }
+ )
+ }
+
+ self.queries = {
+ "test": Dataset.from_dict(
+ {
+ "id": [f"q{i}" for i in range(2)],
+ "image": [images[i] for i in range(2)],
+ "text": [
+ "This is a positive sentence",
+ "This is another positive sentence",
+ ],
+ "modality": ["image,text" for _ in range(2)],
+ }
+ )
+ }
+
+ self.relevant_docs = {
+ "test": {
+ "q0": {"d1": 1, "d2": 0},
+ "q1": {"d1": 0, "d2": 1},
+ },
+ }
+ self.data_loaded = True
+
+
+class MockMultilingualMultiChoiceTask(AbsTaskAny2AnyMultiChoice, MultilingualTask):
+ metadata = TaskMetadata(
+ type="Any2AnyMultiChoice",
+ name="MockMultilingualMultiChoice",
+ main_score="accuracy",
+ descriptive_stats={
+ "test": {
+ "num_samples": 4,
+ "average_question_length": 26.0,
+ "average_choice_length": 30.5,
+ "unique_labels": 2,
+ "labels": {"1": {"count": 2}, "0": {"count": 2}},
+ "hf_subset_descriptive_stats": {
+ "eng": {
+ "num_samples": 2,
+ "average_question_length": 26.0,
+ "average_choice_length": 30.5,
+ "unique_labels": 2,
+ "labels": {"1": {"count": 1}, "0": {"count": 1}},
+ },
+ "fra": {
+ "num_samples": 2,
+ "average_question_length": 26.0,
+ "average_choice_length": 30.5,
+ "unique_labels": 2,
+ "labels": {"1": {"count": 1}, "0": {"count": 1}},
+ },
+ },
+ }
+ },
+ **general_args, # type: ignore
+ )
+ metadata.eval_langs = multilingual_eval_langs
+ metadata.modalities = ["image", "text"]
+ metadata.category = "it2i"
+
+ def load_data(self, **kwargs):
+ images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+ images = [
+ Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+ ]
+
+ corpus = {
+ "test": Dataset.from_dict(
+ {
+ "id": ["d1", "d2"],
+ "image": [images[i] for i in range(2)],
+ "modality": ["image" for _ in range(2)],
+ }
+ )
+ }
+ self.corpus = {
+ "eng": corpus,
+ "fra": corpus,
+ }
+
+ queries = {
+ "test": Dataset.from_dict(
+ {
+ "id": [f"q{i}" for i in range(2)],
+ "image": [images[i] for i in range(2)],
+ "text": [
+ "This is a positive sentence",
+ "This is another positive sentence",
+ ],
+ "modality": ["image,text" for _ in range(2)],
+ }
+ )
+ }
+ self.queries = {
+ "eng": queries,
+ "fra": queries,
+ }
+
+ relevant_docs = {
+ "test": {
+ "q0": {"d1": 1, "d2": 0},
+ "q1": {"d1": 0, "d2": 1},
+ },
+ }
+ self.relevant_docs = {
+ "eng": relevant_docs,
+ "fra": relevant_docs,
+ }
+
+ self.data_loaded = True
+
+
+class MockAny2AnyRetrievalI2TTask(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ type="Any2AnyRetrieval",
+ name="MockAny2AnyRetrievalI2T",
+ main_score="ndcg_at_10",
+ descriptive_stats={
+ "test": {
+ "average_document_length": 30.0,
+ "average_query_length": 26.0,
+ "num_documents": 2,
+ "num_queries": 2,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ **general_args, # type: ignore
+ )
+ metadata.modalities = ["image", "text"]
+ metadata.category = "i2t"
+
+ def load_data(self, **kwargs):
+ images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+ images = [
+ Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+ ]
+
+ self.queries = {
+ "test": Dataset.from_dict(
+ {
+ "id": [f"q{i}" for i in range(2)],
+ "image": [images[i] for i in range(2)],
+ "modality": ["image" for _ in range(2)],
+ }
+ )
+ }
+ self.corpus = {
+ "test": Dataset.from_dict(
+ {
+ "id": ["d1", "d2"],
+ "text": [
+ "This is a positive sentence",
+ "This is another positive sentence",
+ ],
+ "modality": ["text" for _ in range(2)],
+ }
+ )
+ }
+
+ self.relevant_docs = {
+ "test": {
+ "q0": {"d1": 1, "d2": 0},
+ "q1": {"d1": 0, "d2": 1},
+ },
+ }
+ self.data_loaded = True
+
+
+class MockAny2AnyRetrievalT2ITask(AbsTaskAny2AnyRetrieval):
+ metadata = TaskMetadata(
+ type="Any2AnyRetrieval",
+ name="MockAny2AnyRetrievalT2I",
+ main_score="ndcg_at_10",
+ descriptive_stats={
+ "test": {
+ "average_document_length": 30.0,
+ "average_query_length": 26.0,
+ "num_documents": 2,
+ "num_queries": 2,
+ "average_relevant_docs_per_query": 1.0,
+ }
+ },
+ **general_args, # type: ignore
+ )
+ metadata.modalities = ["image", "text"]
+ metadata.category = "t2i"
+
+ def load_data(self, **kwargs):
+ images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+ images = [
+ Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+ ]
+
+ self.queries = {
+ "test": Dataset.from_dict(
+ {
+ "id": [f"q{i}" for i in range(2)],
+ "text": [
+ "This is a positive sentence",
+ "This is another positive sentence",
+ ],
+ "modality": ["text" for _ in range(2)],
+ }
+ )
+ }
+ self.corpus = {
+ "test": Dataset.from_dict(
+ {
+ "id": ["d1", "d2"],
+ "image": [images[i] for i in range(2)],
+ "modality": ["image" for _ in range(2)],
+ }
+ )
+ }
+
+ self.relevant_docs = {
+ "test": {
+ "q0": {"d1": 1, "d2": 0},
+ "q1": {"d1": 0, "d2": 1},
+ },
+ }
+ self.data_loaded = True
+
+
+class MockTextMultipleChoiceTask(AbsTaskAny2TextMultipleChoice):
+ metadata = TaskMetadata(
+ type="Any2TextMutipleChoice",
+ name="MockTextMultipleChoice",
+ main_score="accuracy",
+ descriptive_stats={
+ "test": {
+ # TODO: Add descriptive stats
+ }
+ },
+ **general_args, # type: ignore
+ )
+ metadata.modalities = ["text", "image"]
+ metadata.category = "it2i"
+
+ def load_data(self, **kwargs):
+ images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+ images = [
+ Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+ ]
+
+ self.dataset = DatasetDict(
+ {
+ "test": Dataset.from_dict(
+ {
+ "id": [f"q{i}" for i in range(2)],
+ "image": [images[i] for i in range(2)],
+ "question": [
+ "This is a positive sentence",
+ "This is another positive sentence",
+ ],
+ "choices": [["3", "2", "1", "0"], ["3", "2", "1", "0"]],
+ "answer": ["1", "0"],
+ }
+ )
+ }
+ )
+
+
+class MockImageClassificationTask(AbsTaskImageClassification):
+    """Mock image classification task: 2 test and 10 train samples, binary labels."""
+
+    metadata = TaskMetadata(
+        type="ImageClassification",
+        name="MockImageClassification",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "num_samples": 2,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            },
+            "train": {
+                "num_samples": 10,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 5}, "0": {"count": 5}},
+            },
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.category = "i2i"
+
+    def __init__(self, **kwargs):
+        # A single experiment with 5 samples per label keeps the test fast.
+        super().__init__(n_experiments=1, samples_per_label=5, **kwargs)
+
+    def load_data(self, **kwargs):
+        """Create random RGBA images with balanced binary labels for both splits."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [1, 0]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "label": labels,
+                    }
+                ),
+                # Train split repeats the same 2 samples 5x (10 samples total).
+                "train": Dataset.from_dict(
+                    {
+                        "image": images * 5,
+                        "label": labels * 5,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockImageClassificationKNNPTTask(AbsTaskImageClassification):
+    """Same data as MockImageClassificationTask, evaluated with the kNN-pytorch method."""
+
+    metadata = TaskMetadata(
+        type="ImageClassification",
+        name="MockImageClassificationKNNPT",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "num_samples": 2,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            },
+            "train": {
+                "num_samples": 10,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 5}, "0": {"count": 5}},
+            },
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.category = "i2i"
+
+    def __init__(self, **kwargs):
+        # Exercises the "kNN-pytorch" classification path instead of the default.
+        super().__init__(
+            method="kNN-pytorch", n_experiments=1, samples_per_label=5, **kwargs
+        )
+
+    def load_data(self, **kwargs):
+        """Create random RGBA images with balanced binary labels for both splits."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [1, 0]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "label": labels,
+                    }
+                ),
+                "train": Dataset.from_dict(
+                    {
+                        "image": images * 5,
+                        "label": labels * 5,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockImageClassificationKNNTask(AbsTaskImageClassification):
+    """Same data as MockImageClassificationTask, evaluated with the plain kNN method."""
+
+    metadata = TaskMetadata(
+        type="ImageClassification",
+        name="MockImageClassificationKNN",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "num_samples": 2,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            },
+            "train": {
+                "num_samples": 10,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 5}, "0": {"count": 5}},
+            },
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.category = "i2i"
+
+    def __init__(self, **kwargs):
+        # Exercises the "kNN" classification path instead of the default.
+        super().__init__(method="kNN", n_experiments=1, samples_per_label=5, **kwargs)
+
+    def load_data(self, **kwargs):
+        """Create random RGBA images with balanced binary labels for both splits."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [1, 0]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "label": labels,
+                    }
+                ),
+                "train": Dataset.from_dict(
+                    {
+                        "image": images * 5,
+                        "label": labels * 5,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockMultilingualImageClassificationTask(
+    AbsTaskImageClassification, MultilingualTask
+):
+    """Multilingual variant: the same image-classification data under "eng" and "fra"."""
+
+    # Class-level overrides (instead of __init__ kwargs) keep the test fast.
+    n_experiments = 1
+    samples_per_label = 5
+    metadata = TaskMetadata(
+        type="ImageClassification",
+        name="MockMultilingualImageClassification",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "num_samples": 4,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 2}, "0": {"count": 2}},
+                # Per-subset stats mirror the aggregate, halved per language.
+                "hf_subset_descriptive_stats": {
+                    "eng": {
+                        "num_samples": 2,
+                        "average_image_size": 26.0,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 1}, "0": {"count": 1}},
+                    },
+                    "fra": {
+                        "num_samples": 2,
+                        "average_image_size": 26.0,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 1}, "0": {"count": 1}},
+                    },
+                },
+            },
+            "train": {
+                "num_samples": 20,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 10}, "0": {"count": 10}},
+                "hf_subset_descriptive_stats": {
+                    "eng": {
+                        "num_samples": 10,
+                        "average_image_size": 26.0,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 5}, "0": {"count": 5}},
+                    },
+                    "fra": {
+                        "num_samples": 10,
+                        "average_image_size": 26.0,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 5}, "0": {"count": 5}},
+                    },
+                },
+            },
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.category = "i2i"
+    metadata.eval_langs = multilingual_eval_langs
+
+    def load_data(self, **kwargs):
+        """Build identical test/train splits and register them for both subsets."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [1, 0]
+        data = {
+            "test": Dataset.from_dict(
+                {
+                    "image": images,
+                    "label": labels,
+                }
+            ),
+            "train": Dataset.from_dict(
+                {
+                    "image": images * 5,
+                    "label": labels * 5,
+                }
+            ),
+        }
+
+        # Both languages share the exact same data object.
+        self.dataset = DatasetDict(
+            {
+                "eng": data,
+                "fra": data,
+            }
+        )
+        self.data_loaded = True
+
+
+class MockImageClusteringTask(AbsTaskImageClustering):
+    """Mock image clustering task with two random images and binary labels."""
+
+    # NOTE(review): unlike the sibling mock tasks, no metadata.category is set here
+    # (others use e.g. "i2i") — confirm whether that omission is intentional.
+    metadata = TaskMetadata(
+        type="ImageClustering",
+        name="MockImageClustering",
+        main_score="nmi",
+        descriptive_stats={
+            "test": {
+                "num_samples": 2,
+                "average_image_size": 26.0,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+
+    def load_data(self, **kwargs):
+        """Create a single "test" split of random RGBA images with labels."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [1, 0]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "label": labels,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockImageMultilabelClassificationTask(AbsTaskImageMultilabelClassification):
+    """Mock multilabel image classification: each image carries a list of labels."""
+
+    metadata = TaskMetadata(
+        type="ImageMultilabelClassification",
+        name="MockImageMultilabelClassification",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "average_image_size": 26.0,
+                "average_label_per_image": 2.0,
+                "num_samples": 6,
+                "unique_labels": 2,
+                # NOTE(review): load_data below uses labels "0"-"3" (4 unique),
+                # while these mock stats claim 2 — confirm if intentional.
+                "labels": {"0": {"count": 6}, "1": {"count": 6}},
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.category = "i2i"
+    # Class-level evaluation settings keep the test fast.
+    n_experiments = 1
+    samples_per_label = 3
+
+    def load_data(self, **kwargs):
+        """Create random RGBA images, each paired with two string labels."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [["0", "3"], ["1", "2"]]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images * 2,
+                        "labels": labels * 2,
+                    }
+                ),
+                "train": Dataset.from_dict(
+                    {
+                        "image": images * 5,
+                        "labels": labels * 5,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockMultilingualImageMultilabelClassificationTask(
+    AbsTaskImageMultilabelClassification, MultilingualTask
+):
+    """Multilingual multilabel variant: identical data under "eng" and "fra" subsets."""
+
+    metadata = TaskMetadata(
+        type="ImageMultilabelClassification",
+        name="MockMultilingualImageMultilabelClassification",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "average_image_size": 26.0,
+                "average_label_per_image": 2.0,
+                "num_samples": 12,
+                "unique_labels": 2,
+                "labels": {"0": {"count": 12}, "1": {"count": 12}},
+                # Per-subset stats mirror the aggregate, halved per language.
+                "hf_subset_descriptive_stats": {
+                    "eng": {
+                        "average_image_size": 26.0,
+                        "average_label_per_image": 2.0,
+                        "num_samples": 6,
+                        "unique_labels": 2,
+                        "labels": {"0": {"count": 6}, "1": {"count": 6}},
+                    },
+                    "fra": {
+                        "average_image_size": 26.0,
+                        "average_label_per_image": 2.0,
+                        "num_samples": 6,
+                        "unique_labels": 2,
+                        "labels": {"0": {"count": 6}, "1": {"count": 6}},
+                    },
+                },
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image"]
+    metadata.eval_langs = multilingual_eval_langs
+
+    def load_data(self, **kwargs):
+        """Build multilabel test/train splits and register them for both subsets."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = [["0", "3"], ["1", "2"]]
+
+        data = {
+            "test": Dataset.from_dict(
+                {
+                    "image": images * 2,
+                    "labels": labels * 2,
+                }
+            ),
+            "train": Dataset.from_dict(
+                {
+                    "image": images * 5,
+                    "labels": labels * 5,
+                }
+            ),
+        }
+
+        # Both languages share the exact same data object.
+        self.dataset = DatasetDict(
+            {
+                "eng": data,
+                "fra": data,
+            }
+        )
+        self.data_loaded = True
+
+
+class MockImageTextPairClassificationTask(AbsTaskImageTextPairClassification):
+    """Mock image-text pair classification with a single (images, captions) example."""
+
+    metadata = TaskMetadata(
+        type="ImageTextPairClassification",
+        name="MockImageTextPairClassification",
+        main_score="text_acc",
+        descriptive_stats={
+            "test": {
+                "average_image_size": 26.0,
+                "average_text_length": 30.0,
+                "num_samples": 2,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image", "text"]
+    metadata.category = "i2t"
+
+    def load_data(self, **kwargs):
+        """Build one example whose fields are *lists* of images/captions (nested)."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        # Note the extra nesting: a single row holding a list of two images.
+        images = [
+            [Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images]
+        ]
+        texts = [["This is a test sentence", "This is another test sentence"]]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "caption": texts,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+
+class MockMultilingualImageTextPairClassificationTask(
+    AbsTaskImageTextPairClassification, MultilingualTask
+):
+    """Multilingual image-text pair classification: same data under "eng" and "fra"."""
+
+    metadata = TaskMetadata(
+        type="ImageTextPairClassification",
+        name="MockMultilingualImageTextPairClassification",
+        # NOTE(review): the non-multilingual sibling uses main_score="text_acc";
+        # confirm whether "accuracy" is reported by this evaluator or a typo.
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "average_image_size": 26.0,
+                "average_text_length": 30.0,
+                "num_samples": 4,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 2}, "0": {"count": 2}},
+                # Per-subset stats mirror the aggregate, halved per language.
+                "hf_subset_descriptive_stats": {
+                    "eng": {
+                        "average_image_size": 26.0,
+                        "average_text_length": 30.0,
+                        "num_samples": 2,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 1}, "0": {"count": 1}},
+                    },
+                    "fra": {
+                        "average_image_size": 26.0,
+                        "average_text_length": 30.0,
+                        "num_samples": 2,
+                        "unique_labels": 2,
+                        "labels": {"1": {"count": 1}, "0": {"count": 1}},
+                    },
+                },
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image", "text"]
+    metadata.category = "i2t"
+
+    metadata.eval_langs = multilingual_eval_langs
+
+    def load_data(self, **kwargs):
+        """Build one nested (images, captions) example, shared by both subsets."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        # Note the extra nesting: a single row holding a list of two images.
+        images = [
+            [Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images]
+        ]
+        texts = [["This is a test sentence", "This is another test sentence"]]
+        data = {
+            "test": Dataset.from_dict(
+                {
+                    "image": images,
+                    "caption": texts,
+                }
+            ),
+        }
+
+        # Both languages share the exact same data object.
+        self.dataset = DatasetDict(
+            {
+                "eng": data,
+                "fra": data,
+            }
+        )
+        self.data_loaded = True
+
+
+class MockVisualSTSTask(AbsTaskVisualSTS):
+    """Mock visual STS task: image pairs scored for similarity."""
+
+    metadata = TaskMetadata(
+        type="VisualSTS",
+        name="MockVisualSTS",
+        main_score="cosine_spearman",
+        descriptive_stats={
+            "test": {
+                "average_image_size": 26.0,
+                "average_text_length": 30.0,
+                "num_samples": 2,
+                "average_score": 0.5,
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image", "text"]
+    metadata.category = "i2i"
+
+    def load_data(self, **kwargs):
+        """Create two image pairs (each image paired with itself) scored 0.5."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        scores = [0.5, 0.5]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        # STS naming convention: "sentence1"/"sentence2" hold images here.
+                        "sentence1": images,
+                        "sentence2": images,
+                        "score": scores,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        """Extend the base metadata dict with the STS score range (0-5).
+
+        NOTE(review): int values are stored despite the dict[str, str] annotation.
+        """
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 5
+        return metadata_dict
+
+
+class MockZeroshotClassificationTask(AbsTaskZeroshotClassification):
+    """Mock zero-shot image classification using candidate text labels."""
+
+    metadata = TaskMetadata(
+        type="ZeroShotClassification",
+        name="MockZeroshotClassification",
+        main_score="accuracy",
+        descriptive_stats={
+            "test": {
+                "average_text_length": 26.0,
+                "num_samples": 2,
+                "unique_labels": 2,
+                "labels": {"1": {"count": 1}, "0": {"count": 1}},
+            }
+        },
+        **general_args,  # type: ignore
+    )
+    metadata.modalities = ["image", "text"]
+    metadata.category = "i2t"
+
+    def load_data(self, **kwargs):
+        """Create two random RGBA images with string labels."""
+        images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)]
+        images = [
+            Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images
+        ]
+        labels = ["label1", "label2"]
+
+        self.dataset = DatasetDict(
+            {
+                "test": Dataset.from_dict(
+                    {
+                        "image": images,
+                        "label": labels,
+                    }
+                ),
+            }
+        )
+        self.data_loaded = True
+
+    def get_candidate_labels(self) -> list[str]:
+        """Return the candidate caption strings scored against each image."""
+        return ["This is a test sentence", "This is another test sentence"]
diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py
index c28ad3ea59..77d8f4c8e7 100644
--- a/tests/test_benchmark/task_grid.py
+++ b/tests/test_benchmark/task_grid.py
@@ -12,17 +12,30 @@
)
from .mock_tasks import (
+ MockAny2AnyRetrievalI2TTask,
+ MockAny2AnyRetrievalT2ITask,
MockBitextMiningTask,
MockClassificationTask,
MockClusteringFastTask,
MockClusteringTask,
+ MockImageClassificationKNNPTTask,
+ MockImageClassificationKNNTask,
+ MockImageClassificationTask,
+ MockImageClusteringTask,
+ MockImageMultilabelClassificationTask,
+ MockImageTextPairClassificationTask,
MockInstructionRetrival,
+ MockMultiChoiceTask,
MockMultilabelClassification,
MockMultilingualBitextMiningTask,
MockMultilingualClassificationTask,
MockMultilingualClusteringFastTask,
MockMultilingualClusteringTask,
+ MockMultilingualImageClassificationTask,
+ MockMultilingualImageMultilabelClassificationTask,
+ MockMultilingualImageTextPairClassificationTask,
MockMultilingualInstructionRetrival,
+ MockMultilingualMultiChoiceTask,
MockMultilingualMultilabelClassification,
MockMultilingualPairClassificationTask,
MockMultilingualParallelBitextMiningTask,
@@ -35,6 +48,9 @@
MockRetrievalTask,
MockSTSTask,
MockSummarizationTask,
+ MockTextMultipleChoiceTask,
+ MockVisualSTSTask,
+ MockZeroshotClassificationTask,
)
twenty_news = TwentyNewsgroupsClusteringFast()
@@ -99,3 +115,30 @@
]
MOCK_TASK_REGISTRY = {task.metadata.name: type(task) for task in MOCK_TASK_TEST_GRID}
+
+# All MIEB (image/text) mock tasks, instantiated once for the test grid.
+MOCK_MIEB_TASK_GRID = [
+    MockAny2AnyRetrievalI2TTask(),
+    MockAny2AnyRetrievalT2ITask(),
+    MockTextMultipleChoiceTask(),
+    MockMultiChoiceTask(),
+    MockImageClassificationTask(),
+    MockImageClassificationKNNPTTask(),
+    MockImageClassificationKNNTask(),
+    MockImageClusteringTask(),
+    MockImageTextPairClassificationTask(),
+    MockVisualSTSTask(),
+    MockZeroshotClassificationTask(),
+    MockImageMultilabelClassificationTask(),
+    MockMultilingualImageClassificationTask(),
+    MockMultilingualImageTextPairClassificationTask(),
+    MockMultilingualMultiChoiceTask(),
+    MockMultilingualImageMultilabelClassificationTask(),
+]
+
+# Task names for membership checks (mirrors MOCK_TASK_TEST_GRID_AS_STRING above).
+MOCK_MIEB_TASK_GRID_AS_STRING = [
+    t.metadata.name if isinstance(t, AbsTask) else t for t in MOCK_MIEB_TASK_GRID
+]
+
+# Name -> class mapping, used to re-instantiate mock tasks by name.
+MOCK_MIEB_TASK_REGISTRY = {
+    task.metadata.name: type(task) for task in MOCK_MIEB_TASK_GRID
+}
diff --git a/tests/test_benchmark/test_benchmark_integration_with_datasets.py b/tests/test_benchmark/test_benchmark_integration_with_datasets.py
index 252b9ff3c6..31a4b3d30c 100644
--- a/tests/test_benchmark/test_benchmark_integration_with_datasets.py
+++ b/tests/test_benchmark/test_benchmark_integration_with_datasets.py
@@ -1,4 +1,4 @@
-"""test mteb.MTEB's integration with SentenceTransformers"""
+"""test mteb.MTEB's integration with datasets"""
from __future__ import annotations
diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py
index 5e310dfbb6..7a87914f0a 100644
--- a/tests/test_tasks/test_all_abstasks.py
+++ b/tests/test_tasks/test_all_abstasks.py
@@ -14,16 +14,21 @@
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask
from mteb.abstasks.aggregated_task import AbsTaskAggregate
+from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader
from mteb.overview import TASKS_REGISTRY
-from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING
+from ..test_benchmark.task_grid import (
+ MOCK_MIEB_TASK_GRID_AS_STRING,
+ MOCK_TASK_TEST_GRID_AS_STRING,
+)
logging.basicConfig(level=logging.INFO)
-tasks = [
- t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING
-]
+ALL_MOCK_TASKS = MOCK_TASK_TEST_GRID_AS_STRING + MOCK_MIEB_TASK_GRID_AS_STRING
+
+tasks = [t for t in MTEB().tasks_cls if t.metadata.name not in ALL_MOCK_TASKS]
@pytest.mark.parametrize("task", tasks)
@@ -35,9 +40,11 @@ def test_load_data(
# TODO: We skip because this load_data is completely different.
if (
isinstance(task, AbsTaskRetrieval)
+ or isinstance(task, AbsTaskAny2AnyRetrieval)
or isinstance(task, AbsTaskInstructionRetrieval)
or isinstance(task, MultiSubsetLoader)
or isinstance(task, AbsTaskSpeedTask)
+ or isinstance(task, AbsTaskAny2AnyMultiChoice)
):
pytest.skip()
with patch.object(task, "dataset_transform") as mock_dataset_transform:
@@ -93,7 +100,7 @@ def test_dataset_availability():
t
for t in tasks
if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING
- if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING
+ if t.metadata.name not in MOCK_MIEB_TASK_GRID_AS_STRING
and t.metadata.name
!= "AfriSentiLangClassification" # HOTFIX: Issue#1777. Remove this line when issue is resolved.
]
diff --git a/tests/test_tasks/test_mieb_datasets.py b/tests/test_tasks/test_mieb_datasets.py
new file mode 100644
index 0000000000..26e60931ec
--- /dev/null
+++ b/tests/test_tasks/test_mieb_datasets.py
@@ -0,0 +1,24 @@
+"""test mteb.MTEB's integration with datasets"""
+
+from __future__ import annotations
+
+import logging
+
+import pytest
+
+import mteb
+from mteb import MTEB
+from mteb.abstasks import AbsTask
+
+from ..test_benchmark.mock_models import MockCLIPEncoder
+from ..test_benchmark.task_grid import MOCK_MIEB_TASK_GRID
+
+logging.basicConfig(level=logging.INFO)
+
+
+@pytest.mark.parametrize("task", MOCK_MIEB_TASK_GRID)
+@pytest.mark.parametrize("model", [MockCLIPEncoder()])
+def test_benchmark_sentence_transformer(task: str | AbsTask, model: mteb.Encoder):
+ """Test that a task can be fetched and run"""
+ eval = MTEB(tasks=[task])
+ eval.run(model, output_folder="tests/results", overwrite_results=True)