diff --git a/.github/workflows/dataset_loading.yml b/.github/workflows/dataset_loading.yml index bf3dc50676..3f8b4b040f 100644 --- a/.github/workflows/dataset_loading.yml +++ b/.github/workflows/dataset_loading.yml @@ -1,7 +1,6 @@ name: Datasets available on HuggingFace on: - pull_request: push: branches: [main] @@ -21,7 +20,8 @@ jobs: - name: Install dependencies run: | - make install-for-tests + make install-for-tests + - name: Run dataset loading tests run: | make dataset-load-test diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65bcde1ba9..1400c284e1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,13 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Cache Hugging Face + id: cache-hf + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: ${{ runner.os }}-hf + - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: diff --git a/Makefile b/Makefile index 463b66548d..e49a63193d 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ serve-docs: model-load-test: @echo "--- 🚀 Running model load test ---" - pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" + pip install ".[dev, pylate,gritlm,xformers,model2vec]" python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt diff --git a/README.md b/README.md index ebcc3e1a26..8f8342d0f9 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,6 @@ mteb run -m sentence-transformers/all-MiniLM-L6-v2 \ Note that using multiple GPUs in parallel can be done by just having a custom encode function that distributes the inputs to multiple GPUs like e.g. [here](https://github.com/microsoft/unilm/blob/b60c741f746877293bb85eed6806736fc8fa0ffd/e5/mteb_eval.py#L60) or [here](https://github.com/ContextualAI/gritlm/blob/09d8630f0c95ac6a456354bcb6f964d7b9b6a609/gritlm/gritlm.py#L75). 
See [custom models](docs/usage/usage.md#using-a-custom-model) for more information. - ## Usage Documentation The following links to the main sections in the usage documentation. @@ -102,16 +101,16 @@ The following links to the main sections in the usage documentation. ## Overview -| Overview | | +| Overview | | |--------------------------------|-------------------------------------------------------------------------------------| | 📈 [Leaderboard] | The interactive leaderboard of the benchmark | | 📋 [Tasks] | Overview of available tasks | | 📐 [Benchmarks] | Overview of available benchmarks | -| **Contributing** | | -| 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | -| 👩‍🔬 [Reproducible workflows] | Information related to how to create reproducible workflows with MTEB | -| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | -| 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | +| **Contributing** | | +| 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | +| 👩‍🔬 [Reproducible workflows] | Information related to how to create reproducible workflows with MTEB | +| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | +| 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | | 🤝 [Contributing] | How to contribute to MTEB and set it up for development | [Tasks]: docs/tasks.md @@ -125,23 +124,13 @@ The following links to the main sections in the usage documentation. ## Citing -MTEB was introduced in "[MTEB: Massive Text Embedding Benchmark](https://arxiv.org/abs/2210.07316)", and heavily expanded in "[MMTEB: Massive Multilingual Text Embedding Benchmark](https://arxiv.org/abs/2502.13595)". When using `mteb` we recommend that you cite both articles. 
+MTEB was introduced in "[MTEB: Massive Text Embedding Benchmark](https://arxiv.org/abs/2210.07316)", and heavily expanded in "[MMTEB: Massive Multilingual Text Embedding Benchmark](https://arxiv.org/abs/2502.13595)". When using `mteb`, we recommend that you cite both articles.
Bibtex Citation (click to unfold) ```bibtex -@article{enevoldsen2025mmtebmassivemultilingualtext, - title={MMTEB: Massive Multilingual Text Embedding Benchmark}, - author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, - publisher = {arXiv}, - journal={arXiv preprint arXiv:2502.13595}, - year={2025}, - url={https://arxiv.org/abs/2502.13595}, - doi = {10.48550/arXiv.2502.13595}, -} - @article{muennighoff2022mteb, author = {Muennighoff, Niklas and 
Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, title = {MTEB: Massive Text Embedding Benchmark}, @@ -151,21 +140,31 @@ MTEB was introduced in "[MTEB: Massive Text Embedding Benchmark](https://arxiv.o url = {https://arxiv.org/abs/2210.07316}, doi = {10.48550/ARXIV.2210.07316}, } + +@article{enevoldsen2025mmtebmassivemultilingualtext, + title={MMTEB: Massive Multilingual Text Embedding Benchmark}, + author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and 
Niklas Muennighoff}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2502.13595}, + year={2025}, + url={https://arxiv.org/abs/2502.13595}, + doi = {10.48550/arXiv.2502.13595}, +} ```
-If you use any of the specific benchmark we also recommend that you cite the authors. +If you use any of the specific benchmarks, we also recommend that you cite the authors. ```py benchmark = mteb.get_benchmark("MTEB(eng, v2)") -benchmark.citation # get citation for a specific benchmarks +benchmark.citation # get citation for a specific benchmark # you can also create a table of the task for the appendix using: benchmark.tasks.to_latex() ``` -Some of these amazing publications include: +Some of these amazing publications include (ordered chronologically): - Shitao Xiao, Zheng Liu, Peitian Zhang, Niklas Muennighoff. "[C-Pack: Packaged Resources To Advance General Chinese Embedding](https://arxiv.org/abs/2309.07597)" arXiv 2023 - Michael Günther, Jackmin Ong, Isabelle Mohr, Alaeddine Abdessalem, Tanguy Abel, Mohammad Kalim Akram, Susana Guzman, Georgios Mastrapas, Saba Sturua, Bo Wang, Maximilian Werk, Nan Wang, Han Xiao. "[Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents](https://arxiv.org/abs/2310.19923)" arXiv 2023 - Silvan Wehrli, Bert Arnrich, Christopher Irrgang. "[German Text Embedding Clustering Benchmark](https://arxiv.org/abs/2401.02709)" arXiv 2024 diff --git a/docs/adding_a_dataset.md b/docs/adding_a_dataset.md index 0b94e7dee6..b517eb19ff 100644 --- a/docs/adding_a_dataset.md +++ b/docs/adding_a_dataset.md @@ -252,7 +252,7 @@ model = SentenceTransformer(model_name) evaluation = MTEB(tasks=[YourNewTask()]) ``` -- [ ] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb -m {model_name} -t {task_name}` command. +- [ ] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. 
- [ ] `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` - [ ] `intfloat/multilingual-e5-small` - [ ] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores). diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index 3c764bb1be..0625b4ffd5 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -132,3 +132,17 @@ model = ModelMeta( ... ) ``` + +##### Adding model dependencies in pyproject.toml +If you are adding a model that requires additional dependencies, you can add them to the `pyproject.toml` file and, instead of manually checking whether the dependencies are installed, make use of `requires_package` from [requires_package.py](../mteb/requires_packages.py). For example: + +In the [voyage_models.py](../mteb/models/voyage_models.py) file, we have added the following code: +```python +requires_package(self, "voyageai", model_name, "pip install 'mteb[voyageai]'") +``` +and also updated the [pyproject.toml](../pyproject.toml) file with the following code: +```toml +voyageai = ["voyageai>=1.0.0,<2.0.0"] +``` +so that it will check whether voyageai is installed or not. If not, then it will give an error message to install voyageai. This is done to give clear installation warnings. +If you want to give a suggestion instead of a warning, you can use `suggest_package` from [requires_package.py](../mteb/requires_packages.py). diff --git a/docs/tasks.md b/docs/tasks.md index f5dcc916a5..a73b59ed19 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -44,9 +44,9 @@ The following tables give you an overview of the tasks in MTEB. 
| [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | | [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | | [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None | -| [BLINKIT2IMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentric | it2i | [Encyclopaedic] | {'test': 1206} | {'test': {'number_of_characters': 21204, 'num_samples': 1206, 'num_queries': 402, 'num_documents': 804, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 83, 'average_document_image_width': 788.44, 'max_document_image_width': 5087, 'min_document_image_height': 127, 'average_document_image_height': 813.95, 'max_document_image_height': 3230, 'num_document_images': 804, 'min_query_length': 51, 'average_query_length': 52.75, 'max_query_length': 57, 'unique_queries': 3, 'num_query_images': 402, 'min_query_image_width': 166, 'average_query_image_width': 815.13, 'max_query_image_width': 2733, 'min_query_image_height': 254, 'average_query_image_height': 875.38, 'max_query_image_height': 5687, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 804}} | +| [BLINKIT2IMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentricQA | it2i | [Encyclopaedic] | {'test': 1206} | {'test': {'number_of_characters': 21204, 'num_samples': 1206, 'num_queries': 402, 'num_documents': 804, 
'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 83, 'average_document_image_width': 788.44, 'max_document_image_width': 5087, 'min_document_image_height': 127, 'average_document_image_height': 813.95, 'max_document_image_height': 3230, 'num_document_images': 804, 'min_query_length': 51, 'average_query_length': 52.75, 'max_query_length': 57, 'unique_queries': 3, 'num_query_images': 402, 'min_query_image_width': 166, 'average_query_image_width': 815.13, 'max_query_image_width': 2733, 'min_query_image_height': 254, 'average_query_image_height': 875.38, 'max_query_image_height': 5687, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 804}} | | [BLINKIT2IRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | {'test': 1206} | {'test': {'number_of_characters': 21204, 'num_samples': 1206, 'num_queries': 402, 'num_documents': 804, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 804, 'min_query_length': 51, 'average_query_length': 52.75, 'max_query_length': 57, 'unique_queries': 3, 'num_query_images': 402, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 402}} | -| [BLINKIT2TMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentric | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 
'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 22, 'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_query_image_width': 63, 'average_query_image_width': 515.08, 'max_query_image_width': 4096, 'min_query_image_height': 232, 'average_query_image_height': 722.64, 'max_query_image_height': 3226, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | +| [BLINKIT2TMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentricQA | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 22, 'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_query_image_width': 63, 'average_query_image_width': 515.08, 'max_query_image_width': 4096, 'min_query_image_height': 232, 'average_query_image_height': 722.64, 'max_query_image_height': 3226, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | | [BLINKIT2TRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'num_document_images': 0, 'min_query_length': 22, 
'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | @@ -176,10 +176,10 @@ The following tables give you an overview of the tasks in MTEB. | [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUB200I2IRetrieval](https://www.florian-schroff.de/publications/CUB-200.pdf) (Welinder et al., 2010) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 11588} | {'test': {'number_of_characters': 0, 'num_samples': 11588, 'num_queries': 5794, 'num_documents': 5794, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 5794, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 5794, 'min_relevant_docs_per_query': 10, 'average_relevant_docs_per_query': 28.26, 'max_relevant_docs_per_query': 29, 'unique_relevant_docs': 5794}} | | [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | -| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentric | it2t | [Academic] | {'test': 788} | {'test': {'num_samples': 788, 'min_image_width': 200, 'average_image_width': 757.68, 'max_image_width': 2200, 'min_image_height': 181, 'average_image_height': 631.31, 'max_image_height': 2200, 'min_num_choices': 4, 'average_num_choices': 4.55, 
'max_num_choices': 6, 'min_question_length': 30, 'average_question_length': 34.35, 'max_question_length': 45, 'answers': {'2': {'count': 169}, '4': {'count': 63}, '3': {'count': 167}, '1': {'count': 184}, '0': {'count': 182}, '5': {'count': 23}}}} | -| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentric | it2t | [Academic] | {'test': 600} | {'test': {'num_samples': 600, 'min_image_width': 561, 'average_image_width': 1090.96, 'max_image_width': 1600, 'min_image_height': 427, 'average_image_height': 715.99, 'max_image_height': 900, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 130, 'average_question_length': 136.04, 'max_question_length': 147, 'answers': {'0': {'count': 300}, '1': {'count': 300}}}} | -| [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentric | it2t | [Academic] | {'test': 600} | {'test': {'num_samples': 600, 'min_image_width': 561, 'average_image_width': 1099.29, 'max_image_width': 1600, 'min_image_height': 427, 'average_image_height': 721.0, 'max_image_height': 900, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 204, 'average_question_length': 212.4, 'max_question_length': 223, 'answers': {'0': {'count': 303}, '1': {'count': 297}}}} | -| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentric | it2t | [Academic] | {'test': 650} | {'test': {'num_samples': 650, 'min_image_width': 189, 'average_image_width': 546.32, 'max_image_width': 2200, 'min_image_height': 190, 'average_image_height': 448.45, 'max_image_height': 2200, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 132, 'average_question_length': 181.46, 'max_question_length': 224, 'answers': {'0': {'count': 327}, '1': {'count': 323}}}} | +| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | 
VisionCentricQA | it2t | [Academic] | {'test': 805} | {'test': {'number_of_characters': 27095, 'num_samples': 805, 'num_queries': 788, 'num_documents': 17, 'min_document_length': 1, 'average_document_length': 1.41, 'max_document_length': 2, 'unique_documents': 17, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 30, 'average_query_length': 34.35, 'max_query_length': 45, 'unique_queries': 197, 'num_query_images': 788, 'min_query_image_width': 181, 'average_query_image_width': 631.31, 'max_query_image_width': 2200, 'min_query_image_height': 200, 'average_query_image_height': 757.68, 'max_query_image_height': 2200, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 17}} | +| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentricQA | it2t | [Academic] | {'test': 669} | {'test': {'number_of_characters': 82092, 'num_samples': 669, 'num_queries': 600, 'num_documents': 69, 'min_document_length': 3, 'average_document_length': 6.75, 'max_document_length': 17, 'unique_documents': 69, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 130, 'average_query_length': 136.04, 'max_query_length': 147, 'unique_queries': 279, 'num_query_images': 600, 'min_query_image_width': 427, 'average_query_image_width': 715.99, 'max_query_image_width': 900, 'min_query_image_height': 561, 'average_query_image_height': 1090.96, 'max_query_image_height': 1600, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 69}} | +| 
[CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentricQA | it2t | [Academic] | {'test': 656} | {'test': {'number_of_characters': 127804, 'num_samples': 656, 'num_queries': 600, 'num_documents': 56, 'min_document_length': 3, 'average_document_length': 6.46, 'max_document_length': 12, 'unique_documents': 56, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 204, 'average_query_length': 212.4, 'max_query_length': 223, 'unique_queries': 381, 'num_query_images': 600, 'min_query_image_width': 427, 'average_query_image_width': 721.0, 'max_query_image_width': 900, 'min_query_image_height': 561, 'average_query_image_height': 1099.29, 'max_query_image_height': 1600, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 56}} | +| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | VisionCentricQA | it2t | [Academic] | {'test': 654} | {'test': {'number_of_characters': 117967, 'num_samples': 654, 'num_queries': 650, 'num_documents': 4, 'min_document_length': 4, 'average_document_length': 4.75, 'max_document_length': 5, 'unique_documents': 4, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 132, 'average_query_length': 181.46, 'max_query_length': 224, 'unique_queries': 580, 'num_query_images': 650, 'min_query_image_width': 190, 'average_query_image_width': 448.45, 'max_query_image_width': 2200, 'min_query_image_height': 189, 'average_query_image_height': 546.32, 'max_query_image_height': 2200, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 4}} | | [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6084} | {'test': {'num_samples': 6084, 'unique_num_labels': 102, 'min_image_width': 80, 'average_image_width': 311.72, 'max_image_width': 3481, 'min_image_height': 101, 'average_image_height': 241.84, 'max_image_height': 3999, 'labels': {'4': {'count': 437}, '37': {'count': 405}, '38': {'count': 405}, '57': {'count': 170}, '66': {'count': 768}, '0': {'count': 25}, '1': {'count': 770}, '2': {'count': 12}, '3': {'count': 12}, '5': {'count': 17}, '6': {'count': 24}, '7': {'count': 16}, '8': {'count': 3}, '9': {'count': 98}, '10': {'count': 68}, '11': {'count': 13}, '12': {'count': 55}, '13': {'count': 61}, '14': {'count': 20}, '15': {'count': 13}, '16': {'count': 93}, '17': {'count': 17}, '18': {'count': 29}, '19': {'count': 32}, '20': {'count': 77}, '22': {'count': 39}, '23': {'count': 43}, '24': {'count': 40}, '25': {'count': 20}, '26': {'count': 21}, '27': {'count': 27}, '28': {'count': 37}, '29': {'count': 22}, '30': {'count': 35}, '31': {'count': 38}, '32': {'count': 45}, '33': {'count': 34}, '34': {'count': 23}, '35': {'count': 34}, '36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': 
{'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1986} | {'test': {'num_samples': 1986, 'unique_num_labels': 63, 'min_image_width': 105, 'average_image_width': 277.19, 'max_image_width': 300, 'min_image_height': 114, 'average_image_height': 255.33, 'max_image_height': 300, 'min_label_text_length': 17, 'average_label_text_length': 21.88, 'max_label_text_length': 31, 'labels': {'36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': 
{'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -346,7 +346,7 @@ The following tables give you an overview of the tasks in MTEB. | [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | | [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 
'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 
156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 
'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 
311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 
'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': 
{'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 
'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 
'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 
'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 
'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 
'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': 
{'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 
'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 
'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 
'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 
'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 
'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 
'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 
'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 
'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 
'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 
323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 
'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': 
{'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 
'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 
'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 
'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 
'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 
'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 
'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 
'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': 
{'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 
'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 
'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 
'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 
'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 
'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 
'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': 
{'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 
'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 
'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 
'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 
'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 
'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 
'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': 
{'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 
'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 
'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 
'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 
'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 
'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 
'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 
'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 
'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 
'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 
'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': 
{'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 
'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 
'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 
'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 
'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | | [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | 
{'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 
'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 
'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 
'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 
'unique_sentence2': 872}}}} | -| [ImageCoDeT2IMultiChoice](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Compositionality | it2i | [Web, Written] | {'test': 25322} | {'test': {'number_of_characters': 236457, 'num_samples': 25322, 'num_queries': 2302, 'num_documents': 23020, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 23020, 'min_query_length': 1, 'average_query_length': 102.72, 'max_query_length': 350, 'unique_queries': 2302, 'num_query_images': 0, 'min_query_image_width': 0, 'average_query_image_width': 0, 'max_query_image_width': 0, 'min_query_image_height': 0, 'average_query_image_height': 0, 'max_query_image_height': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10390}} | +| [ImageCoDe](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Compositionality | it2i | [Web, Written] | {'test': 2302} | {'test': {'num_samples': 2302, 'num_images': 23020, 'num_texts': 2302, 'num_unique_texts': 2302, 'min_text_length': 1, 'average_text_length': 102.72, 'max_text_length': 350}} | | [ImageCoDeT2IRetrieval](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | {'test': 25322} | {'test': {'number_of_characters': 236457, 'num_samples': 25322, 'num_queries': 2302, 'num_documents': 23020, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 23020, 'min_query_length': 1, 'average_query_length': 102.72, 'max_query_length': 350, 'unique_queries': 2302, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2302}} | | [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 13000} | {'test': {'num_samples': 13000, 'unique_num_labels': 10, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 1300}, '1': {'count': 1300}, '2': {'count': 1300}, '3': {'count': 1300}, '4': {'count': 1300}, '5': {'count': 1300}, '6': {'count': 1300}, '7': {'count': 1300}, '8': {'count': 1300}, '10': {'count': 1300}}}} | | [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 1076} | {'test': {'num_samples': 1076, 'unique_num_labels': 15, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 152}, '1': {'count': 88}, '2': {'count': 75}, '3': {'count': 96}, '4': {'count': 57}, '5': {'count': 50}, '6': {'count': 52}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 53}, '11': {'count': 57}, '12': {'count': 50}, '13': {'count': 100}, '14': {'count': 96}}}} | @@ -383,6 +383,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [KLUE-TC](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | None | None | | [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | | [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | +| [KlueMrcDomainClustering](https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain) (Sungjoon Park, 2021) | ['kor'] | Clustering | p2p | [News, Written] | None | None | +| [KlueYnatMrcCategoryClustering](https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title) (Sungjoon Park, 2021) | ['kor'] | Clustering | s2s | [News, Written] | None | None | | Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | | [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [Financial, News, Written] | None | None | | [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | @@ -594,19 +596,13 @@ The following tables give you an overview of the tasks in MTEB. 
| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6300} | {'test': {'num_samples': 6300, 'unique_num_labels': 45, 'min_image_width': 256, 'average_image_width': 256.0, 'max_image_width': 256, 'min_image_height': 256, 'average_image_height': 256.0, 'max_image_height': 256, 'labels': {'31': {'count': 135}, '11': {'count': 144}, '28': {'count': 135}, '43': {'count': 154}, '41': {'count': 144}, '33': {'count': 134}, '19': {'count': 130}, '16': {'count': 127}, '22': {'count': 130}, '34': {'count': 143}, '24': {'count': 164}, '0': {'count': 169}, '13': {'count': 146}, '25': {'count': 115}, '6': {'count': 132}, '36': {'count': 135}, '39': {'count': 142}, '18': {'count': 140}, '23': {'count': 147}, '37': {'count': 159}, '15': {'count': 122}, '29': {'count': 140}, '9': {'count': 159}, '27': {'count': 140}, '21': {'count': 131}, '3': {'count': 134}, '1': {'count': 162}, '32': {'count': 153}, '26': {'count': 150}, '35': {'count': 151}, '44': {'count': 118}, '30': {'count': 154}, '20': {'count': 139}, '4': {'count': 130}, '42': {'count': 127}, '40': {'count': 137}, '5': {'count': 140}, '17': {'count': 142}, '2': {'count': 123}, '38': {'count': 130}, '10': {'count': 140}, '12': {'count': 146}, '8': {'count': 146}, '7': {'count': 143}, '14': {'count': 118}}}} | | [RESISC45ZeroShot](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 6300} | {'test': {'num_samples': 6300, 'unique_num_labels': 45, 'min_image_width': 256, 'average_image_width': 256.0, 'max_image_width': 256, 'min_image_height': 256, 'average_image_height': 256.0, 'max_image_height': 256, 'min_label_text_length': 26, 'average_label_text_length': 32.16, 'max_label_text_length': 43, 
'labels': {'31': {'count': 135}, '11': {'count': 144}, '28': {'count': 135}, '43': {'count': 154}, '41': {'count': 144}, '33': {'count': 134}, '19': {'count': 130}, '16': {'count': 127}, '22': {'count': 130}, '34': {'count': 143}, '24': {'count': 164}, '0': {'count': 169}, '13': {'count': 146}, '25': {'count': 115}, '6': {'count': 132}, '36': {'count': 135}, '39': {'count': 142}, '18': {'count': 140}, '23': {'count': 147}, '37': {'count': 159}, '15': {'count': 122}, '29': {'count': 140}, '9': {'count': 159}, '27': {'count': 140}, '21': {'count': 131}, '3': {'count': 134}, '1': {'count': 162}, '32': {'count': 153}, '26': {'count': 150}, '35': {'count': 151}, '44': {'count': 118}, '30': {'count': 154}, '20': {'count': 139}, '4': {'count': 130}, '42': {'count': 127}, '40': {'count': 137}, '5': {'count': 140}, '17': {'count': 142}, '2': {'count': 123}, '38': {'count': 130}, '10': {'count': 140}, '12': {'count': 146}, '8': {'count': 146}, '7': {'count': 143}, '14': {'count': 118}}}} | -| [ROxfordEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 
'min_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 43.27, 'max_relevant_docs_per_query': 248, 'unique_relevant_docs': 4993}} | -| [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 584} | {'test': {'number_of_characters': 0, 'num_samples': 584, 'num_queries': 68, 'num_documents': 516, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 516, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 68, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 44.54, 'max_relevant_docs_per_query': 248, 'unique_relevant_docs': 516}} | -| [ROxfordHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 35.67, 'max_relevant_docs_per_query': 284, 'unique_relevant_docs': 4993}} | -| 
[ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 755} | {'test': {'number_of_characters': 0, 'num_samples': 755, 'num_queries': 70, 'num_documents': 685, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 685, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 35.67, 'max_relevant_docs_per_query': 284, 'unique_relevant_docs': 685}} | -| [ROxfordMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 78.94, 'max_relevant_docs_per_query': 347, 'unique_relevant_docs': 4993}} | -| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | 
['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 858} | {'test': {'number_of_characters': 0, 'num_samples': 858, 'num_queries': 70, 'num_documents': 788, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 788, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 78.94, 'max_relevant_docs_per_query': 347, 'unique_relevant_docs': 788}} | +| [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 43.27, 'max_relevant_docs_per_query': 248, 'unique_relevant_docs': 4993}} | +| [ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 
'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 35.67, 'max_relevant_docs_per_query': 284, 'unique_relevant_docs': 4993}} | +| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 78.94, 'max_relevant_docs_per_query': 347, 'unique_relevant_docs': 4993}} | | [RP2kI2IRetrieval](https://arxiv.org/abs/2006.12634) (Peng et al., 
2020) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 77643} | {'test': {'number_of_characters': 0, 'num_samples': 77643, 'num_queries': 38186, 'num_documents': 39457, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 39457, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 38186, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 115.47, 'max_relevant_docs_per_query': 1069, 'unique_relevant_docs': 38181}} | -| [RParisEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 98.2, 'max_relevant_docs_per_query': 199, 'unique_relevant_docs': 6322}} | -| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 1540} | {'test': {'number_of_characters': 0, 'num_samples': 1540, 'num_queries': 70, 
'num_documents': 1470, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 1470, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 98.2, 'max_relevant_docs_per_query': 199, 'unique_relevant_docs': 1470}} | -| [RParisHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 34, 'average_relevant_docs_per_query': 147.86, 'max_relevant_docs_per_query': 556, 'unique_relevant_docs': 6322}} | -| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 2048} | {'test': {'number_of_characters': 0, 'num_samples': 2048, 'num_queries': 70, 'num_documents': 1978, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 1978, 
'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_relevant_docs_per_query': 34, 'average_relevant_docs_per_query': 147.86, 'max_relevant_docs_per_query': 556, 'unique_relevant_docs': 1978}} | -| [RParisMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 76, 'average_relevant_docs_per_query': 246.06, 'max_relevant_docs_per_query': 636, 'unique_relevant_docs': 6322}} | -| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 2721} | {'test': {'number_of_characters': 0, 'num_samples': 2721, 'num_queries': 70, 'num_documents': 2651, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2651, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_relevant_docs_per_query': 76, 
'average_relevant_docs_per_query': 246.06, 'max_relevant_docs_per_query': 636, 'unique_relevant_docs': 2651}} | +| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 98.2, 'max_relevant_docs_per_query': 199, 'unique_relevant_docs': 6322}} | +| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 
'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 34, 'average_relevant_docs_per_query': 147.86, 'max_relevant_docs_per_query': 556, 'unique_relevant_docs': 6322}} | +| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 76, 'average_relevant_docs_per_query': 246.06, 'max_relevant_docs_per_query': 636, 'unique_relevant_docs': 6322}} | | [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None | | [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 142403} | {'test': {'number_of_characters': 29161615, 'num_samples': 142403, 
'num_queries': 3609, 'num_documents': 138794, 'min_document_length': 9, 'average_document_length': 208.19, 'max_document_length': 508, 'unique_documents': 138794, 'num_document_images': 0, 'min_query_length': 18, 'average_query_length': 73.86, 'max_query_length': 218, 'unique_queries': 3608, 'num_query_images': 3609, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3607}} | @@ -822,6 +818,7 @@ The following tables give you an overview of the tasks in MTEB. | [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | +| [VDRMultilingualRetrieval](https://huggingface.co/datasets/llamaindex/vdr-multilingual-test) (LlamaIndex, 2025) | ['deu', 'eng', 'fra', 'ita', 'spa'] | Retrieval | it2it | [Web] | None | None | | [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 4952} | {'test': {'num_samples': 4952, 'min_image_width': 148, 'average_image_width': 471.25, 'max_image_width': 500, 'min_image_height': 139, 'average_image_height': 381.54, 'max_image_height': 500, 
'min_labels_per_sample': 1, 'average_label_per_sample': 1.42, 'max_labels_per_sample': 5, 'unique_num_labels': 20, 'labels': {'14': {'count': 2007}, '11': {'count': 418}, '18': {'count': 259}, '17': {'count': 223}, '8': {'count': 417}, '6': {'count': 721}, '15': {'count': 224}, '10': {'count': 190}, '12': {'count': 274}, '7': {'count': 322}, '9': {'count': 127}, '5': {'count': 174}, '1': {'count': 239}, '13': {'count': 222}, '2': {'count': 282}, '19': {'count': 229}, '16': {'count': 97}, '0': {'count': 204}, '3': {'count': 172}, '4': {'count': 212}}}} | @@ -907,7 +904,7 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisionCentric | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | +| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisionCentricQA | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | |---|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|---| | aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1160,7 +1157,7 @@ The following tables give you an overview of the tasks in MTEB. 
| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dan | Danish | Indo-European | 0 | 2 | 0 | 7 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | | ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 19 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 68 | +| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 69 | | dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1188,7 +1185,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 55 | 19 | 160 | 21 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 112 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 496 | +| eng | English | Indo-European | 0 | 3 | 49 | 19 | 160 | 21 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 113 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 491 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1209,7 +1206,7 @@ The following tables give you an overview of the tasks in MTEB. 
| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | | fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 16 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 66 | +| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 67 | | fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1304,7 +1301,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | isl | Icelandic | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | | isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 6 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 35 | +| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 7 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 36 | | iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1371,7 +1368,7 @@ The following tables give you an overview of the tasks in MTEB. 
| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 37 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 39 | | kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1735,7 +1732,7 @@ The following tables give you an overview of the tasks in MTEB. 
| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 14 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 53 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 54 | | spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1961,7 +1958,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 55 | 1492 | 836 | 314 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 91 | 56 | 586 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 49 | 1492 | 836 | 316 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 91 | 56 | 591 | 88 | 2 | 2 | 6 | 7 | 37 | 24 |
diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py index b65ae375f4..56b1797571 100644 --- a/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py +++ b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py @@ -656,3 +656,272 @@ def process_docs( return { f"{split}_{hf_subset}_{k}": v for k, v in collection[hf_subset][split].items() } + + +class MultiChoiceEvaluationMixin: + """A mixin class to enable retrieval tasks to use multiple-choice evaluator; + It is designed for tasks like r-Oxford and r-Pairs that + require masking out different documents in the corpus for each query. + + example usage: + class ROxfordHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): + + It is for overriding `def evaluate`, `def _evaluate_subset` + and `def _calculate_metrics_from_split` of AbsTaskAny2AnyRetrieval. + """ + + def evaluate( + self, + model, + split: str = "test", + *, + encode_kwargs: dict[str, Any] = None, + **kwargs, + ): + # Use Any2AnyMultiChoiceEvaluator instead of Any2AnyRetrievalEvaluator + evaluator = Any2AnyMultiChoiceEvaluator( + retriever=model, + task_name=self.metadata.name, + encode_kwargs=encode_kwargs if encode_kwargs is not None else {}, + **kwargs, + ) + + scores = {} + hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] + + for hf_subset in hf_subsets: + logger.info(f"Subset: {hf_subset}") + + if hf_subset == "default": + corpus, queries, relevant_docs = ( + self.corpus[split], + self.queries[split], + self.relevant_docs[split], + ) + else: + corpus, queries, relevant_docs = ( + self.corpus[hf_subset][split], + self.queries[hf_subset][split], + self.relevant_docs[hf_subset][split], + ) + scores[hf_subset] = self._evaluate_subset( + evaluator, corpus, queries, relevant_docs, hf_subset, **kwargs + ) + return scores + + def _evaluate_subset( + self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs + ): + start_time = time() + results = 
retriever(corpus, queries, relevant_docs) + end_time = time() + logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") + + save_predictions = kwargs.get("save_predictions", False) + export_errors = kwargs.get("export_errors", False) + if save_predictions or export_errors: + output_folder = Path(kwargs.get("output_folder", "results")) + if not os.path.isdir(output_folder): + os.makedirs(output_folder) + + if save_predictions: + top_k = kwargs.get("top_k", None) + if top_k is not None: + for qid in list(results.keys()): + doc_ids = set( + sorted( + results[qid], key=lambda x: results[qid][x], reverse=True + )[:top_k] + ) + results[qid] = { + k: v for k, v in results[qid].items() if k in doc_ids + } + qrels_save_path = ( + output_folder / f"{self.metadata.name}_{hf_subset}_predictions.json" + ) + + with open(qrels_save_path, "w") as f: + json.dump(results, f) + + ndcg, _map, recall, precision, cv_recall, naucs = retriever.evaluate( + relevant_docs, + results, + retriever.k_values, + ignore_identical_ids=self.ignore_identical_ids, + skip_first_result=self.skip_first_result, + ) + mrr, naucs_mrr = retriever.evaluate_custom( + relevant_docs, results, retriever.k_values, "mrr" + ) + scores = { + **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, + **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, + **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, + **{f"cv_recall_at_{k.split('@')[1]}": v for (k, v) in cv_recall.items()}, + **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, + **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()}, + **{ + k.replace("@", "_at_").replace("_P", "_precision").lower(): v + for k, v in naucs.items() + }, + **{ + k.replace("@", "_at_").replace("_P", "_precision").lower(): v + for k, v in naucs_mrr.items() + }, + "accuracy": recall["Recall@1"], + } + self._add_main_score(scores) + + if export_errors: + errors = {} + + top_k = kwargs.get("top_k", 1) + if 
not save_predictions and top_k == 1: + for qid in results.keys(): + doc_scores = results[qid] + sorted_docs = sorted( + doc_scores.items(), key=lambda x: x[1], reverse=True + )[:top_k] + results[qid] = dict(sorted_docs) + for qid, retrieved_docs in results.items(): + expected_docs = relevant_docs[qid] + false_positives = [ + doc for doc in retrieved_docs if doc not in expected_docs + ] + false_negatives = [ + doc for doc in expected_docs if doc not in retrieved_docs + ] + if false_positives or false_negatives: + errors[qid] = { + "false_positives": false_positives, + "false_negatives": false_negatives, + } + + errors_save_path = ( + output_folder / f"{self.metadata.name}_{hf_subset}_errors.json" + ) + with open(errors_save_path, "w") as f: + json.dump(errors, f) + + return scores + + def _calculate_metrics_from_split( + self, split: str, hf_subset: str | None = None, compute_overall: bool = False + ) -> Any2AnyMutipleChoiceDescriptiveStatistics: + if hf_subset: + queries = self.queries[hf_subset][split] + corpus = self.corpus[hf_subset][split] + relevant_docs = self.relevant_docs[hf_subset][split] + elif compute_overall: + queries = {} + corpus = {} + relevant_docs = {} + for hf_subset in self.metadata.eval_langs: + queries.update(process_docs(self.queries, hf_subset, split)) + corpus.update(process_docs(self.corpus, hf_subset, split)) + relevant_docs.update( + process_relevant_docs(self.relevant_docs, hf_subset, split) + ) + else: + queries = self.queries[split] + corpus = self.corpus[split] + relevant_docs = self.relevant_docs[split] + + queries_lens, doc_lens = [], [] + num_query_images = 0 + num_document_images = 0 + + q_modality = queries[0]["modality"] + unique_queries = len(set(queries["text"])) if "text" in q_modality else 0 + + for query in tqdm.tqdm(queries, desc="queries:"): + if "text" in q_modality: + text_query = query["text"] + queries_lens.append(len(text_query)) + if "image" in q_modality: + num_query_images += 1 + + d_modality = 
corpus[0]["modality"] + unique_documents = len(set(corpus["text"])) if "text" in d_modality else 0 + + for doc in tqdm.tqdm(corpus, desc="docs:"): + if "text" in d_modality: + text_doc = doc["text"] + doc_lens.append(len(text_doc)) + if "image" in d_modality: + num_document_images += 1 + + total_doc_len = sum(doc_lens) + total_query_len = sum(queries_lens) + num_documents = len(corpus) + num_queries = len(queries) + + d_modality = corpus[0]["modality"] + imgs = [doc["image"] for doc in corpus if "image" in d_modality] + d_img_widths, d_img_heights = [], [] + for img in imgs: + width, height = img.size + d_img_widths.append(height) + d_img_heights.append(width) + + q_modality = queries[0]["modality"] + imgs = [query["image"] for query in queries if "image" in q_modality] + q_img_widths, q_img_heights = [], [] + for img in imgs: + width, height = img.size + q_img_widths.append(height) + q_img_heights.append(width) + + # create a list of number of relevant docs per query + queries_set = set(queries["id"]) + qrels_lengths = [ + len([v for k, v in relevant_docs[qid].items() if v != 0]) + for qid in tqdm.tqdm(relevant_docs.keys(), desc="qrels:") + if qid in queries_set + ] + num_qrels = sum(qrels_lengths) + qrels_per_doc = num_qrels / len(relevant_docs) if num_queries else 0 + unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]}) + + return Any2AnyMutipleChoiceDescriptiveStatistics( + number_of_characters=total_query_len + total_doc_len, + num_samples=num_documents + num_queries, + num_queries=num_queries, + num_documents=num_documents, + min_document_length=min(doc_lens) if doc_lens else 0, + average_document_length=total_doc_len / len(doc_lens) if doc_lens else 0, + max_document_length=max(doc_lens) if doc_lens else 0, + unique_documents=unique_documents, + min_document_image_width=min(d_img_widths) if d_img_widths else 0, + average_document_image_width=sum(d_img_widths) / len(d_img_widths) + if d_img_widths + else 0, + 
max_document_image_width=max(d_img_widths) if d_img_widths else 0, + min_document_image_height=min(d_img_heights) if d_img_heights else 0, + average_document_image_height=sum(d_img_heights) / len(d_img_heights) + if d_img_heights + else 0, + max_document_image_height=max(d_img_heights) if d_img_heights else 0, + num_document_images=num_document_images, + min_query_length=min(queries_lens) if queries_lens else 0, + average_query_length=total_query_len / len(queries_lens) + if queries_lens + else 0, + max_query_length=max(queries_lens) if queries_lens else 0, + unique_queries=unique_queries, + num_query_images=num_query_images, + min_query_image_width=min(q_img_widths) if q_img_widths else 0, + average_query_image_width=sum(q_img_widths) / len(q_img_widths) + if q_img_widths + else 0, + max_query_image_width=max(q_img_widths) if q_img_widths else 0, + min_query_image_height=min(q_img_heights) if q_img_heights else 0, + average_query_image_height=sum(q_img_heights) / len(q_img_heights) + if q_img_heights + else 0, + max_query_image_height=max(q_img_heights) if q_img_heights else 0, + min_relevant_docs_per_query=min(qrels_lengths), + average_relevant_docs_per_query=qrels_per_doc, + max_relevant_docs_per_query=max(qrels_lengths), + unique_relevant_docs=unique_qrels, + ) diff --git a/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py b/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py deleted file mode 100644 index fda73d457e..0000000000 --- a/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py +++ /dev/null @@ -1,145 +0,0 @@ -from __future__ import annotations - -import logging -from collections import Counter -from typing import Any - -from datasets import Dataset - -from ...encoder_interface import Encoder -from ...evaluation.evaluators import Any2TextMultipleChoiceEvaluator -from ..AbsTask import AbsTask, ScoresDict -from ..TaskMetadata import DescriptiveStatistics - -logger = logging.getLogger(__name__) - - -class 
Any2TextMutipleChoiceDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for Any2TextMutipleChoice - - Attributes: - num_samples: number of samples in the dataset. - - min_image_width: Minimum width of images - average_image_width: Average width of images - max_image_width: Maximum width of images - - min_image_height: Minimum height of images - average_image_height: Average height of images - max_image_height: Maximum height of images - - min_num_choices: Minimum number of choices - average_num_choices: Average number of choices - max_num_choices: Maximum number of choices - - answers: dict of answer frequencies - - min_question_length: Minimum length of questions - average_question_length: Average length of questions - max_question_length: Maximum length of questions - """ - - num_samples: int - - min_image_width: float - average_image_width: float - max_image_width: float - - min_image_height: float - average_image_height: float - max_image_height: float - - min_num_choices: int - average_num_choices: float - max_num_choices: int - - answers: dict[str, dict[str, int]] - - min_question_length: int - average_question_length: float - max_question_length: int - - -class AbsTaskAny2TextMultipleChoice(AbsTask): - """Abstract class for Any to Text Multiple Choice tasks, - where the queries and be either text or image, or both. - This task assess interleaved encoding of queries, - the similarity computed between the queries and the candidate choices is ranked. - - self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. 
- """ - - query_modalities: list[str] | str = ["image", "text"] - query_column_names: dict = {"image": "image", "text": "question"} - label_column_name: str = "answer" - choices_column_name: str = "choices" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def _add_main_score(self, scores) -> None: - scores["main_score"] = scores[self.metadata.main_score] - - def _calculate_metrics_from_split( - self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> Any2TextMutipleChoiceDescriptiveStatistics: - imgs = self.dataset[split][self.query_column_names["image"]] - questions = self.dataset[split][self.query_column_names["text"]] - choices = self.dataset[split][self.choices_column_name] - answers = self.dataset[split][self.label_column_name] - - num_samples = len(answers) - answer_count = Counter(answers) - img_widths, img_heights = [], [] - for img in imgs: - width, height = img.size - img_heights.append(height) - img_widths.append(width) - - choices_len = [len(c) for c in choices] - questions_len = [len(q) for q in questions] - - return Any2TextMutipleChoiceDescriptiveStatistics( - num_samples=num_samples, - min_image_width=min(img_widths), - average_image_width=sum(img_widths) / len(img_widths), - max_image_width=max(img_widths), - min_image_height=min(img_heights), - average_image_height=sum(img_heights) / len(img_heights), - max_image_height=max(img_heights), - min_num_choices=min(choices_len), - average_num_choices=sum(choices_len) / len(choices_len), - max_num_choices=max(choices_len), - min_question_length=min(questions_len), - average_question_length=sum(questions_len) / len(questions_len), - max_question_length=max(questions_len), - answers={ - str(answer): {"count": count} for answer, count in answer_count.items() - }, - ) - - def _evaluate_subset( - self, - model: Encoder, - dataset: Dataset, - *, - encode_kwargs: dict[str, Any] = {}, - **kwargs, - ) -> ScoresDict: - for modality in self.query_modalities: - if 
modality not in self.query_column_names: - raise KeyError( - f"query column name of modality {modality} is not defined" - ) - evaluator = Any2TextMultipleChoiceEvaluator( - dataset, - query_modalities=self.query_modalities, - query_column_names=self.query_column_names, - label_column_name=self.label_column_name, - choices_column_name=self.choices_column_name, - task_name=self.metadata.name, - **kwargs, - ) - scores = evaluator(model, encode_kwargs=encode_kwargs) - self._add_main_score(scores) - return scores diff --git a/mteb/abstasks/Image/__init__.py b/mteb/abstasks/Image/__init__.py index 70c453bcef..d4c5807ce9 100644 --- a/mteb/abstasks/Image/__init__.py +++ b/mteb/abstasks/Image/__init__.py @@ -2,7 +2,6 @@ from .AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from .AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval -from .AbsTaskAny2TextMultipleChoice import AbsTaskAny2TextMultipleChoice from .AbsTaskImageClassification import AbsTaskImageClassification from .AbsTaskImageClustering import AbsTaskImageClustering from .AbsTaskImageMultilabelClassification import AbsTaskImageMultilabelClassification @@ -17,7 +16,6 @@ "AbsTaskImageMultilabelClassification", "AbsTaskImageClustering", "AbsTaskImageClassification", - "AbsTaskAny2TextMultipleChoice", "AbsTaskAny2AnyRetrieval", "AbsTaskAny2AnyMultiChoice", ] diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index f187c736d1..10d8b32b22 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -103,7 +103,7 @@ "Any2AnyMultiChoice", "Any2AnyRetrieval", "Any2AnyMultilingualRetrieval", - "VisionCentric", + "VisionCentricQA", "ImageClustering", "ImageClassification", "ImageMultilabelClassification", diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index 9b79e4d565..91d8cdca67 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -14,7 +14,6 @@ from .Image import ( AbsTaskAny2AnyMultiChoice, AbsTaskAny2AnyRetrieval, - 
AbsTaskAny2TextMultipleChoice, AbsTaskImageClassification, AbsTaskImageClustering, AbsTaskImageMultilabelClassification, @@ -39,7 +38,6 @@ "TaskMetadata", "AbsTaskAny2AnyMultiChoice", "AbsTaskAny2AnyRetrieval", - "AbsTaskAny2TextMultipleChoice", "AbsTaskImageClassification", "AbsTaskImageClustering", "AbsTaskImageMultilabelClassification", diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 029cffb79f..e8d61d4e0b 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -47,6 +47,7 @@ class Benchmark: reference: UrlString | None = None citation: str | None = None contacts: list[str] | None = None + display_on_leaderboard: bool = True def __iter__(self): return iter(self.tasks) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index ea0a9a652b..05266063ce 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -13,17 +13,15 @@ ] # Allows the type to be a string, but ensures that the string is a URL -MMTEB_CITATION = """ -@article{enevoldsen2025mmtebmassivemultilingualtext, - title={MMTEB: Massive Multilingual Text Embedding Benchmark}, - author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos 
Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, - publisher = {arXiv}, - journal={arXiv preprint arXiv:2502.13595}, - year={2025}, - url={https://arxiv.org/abs/2502.13595}, - doi = {10.48550/arXiv.2502.13595}, -} -""" +MMTEB_CITATION = """@article{enevoldsen2025mmtebmassivemultilingualtext, + title={MMTEB: Massive Multilingual Text Embedding Benchmark}, + author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and 
Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2502.13595}, + year={2025}, + url={https://arxiv.org/abs/2502.13595}, + doi = {10.48550/arXiv.2502.13595}, +}""" MTEB_EN = Benchmark( name="MTEB(eng, v2)", @@ -172,26 +170,17 @@ ), description="""The original English benchmark by Muennighoff et al., (2023). This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard_legacy). -We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v2%29) instead as it uses updated versions of the task making it notably faster to run and resolves [a known bug](https://github.com/embeddings-benchmark/mteb/issues/1156) in existing tasks. This benchmark also removes datasets common for fine-tuning such as MSMARCO, which makes model performance scores more comparable. However, generally, both benchmarks provide similar estimates. +We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v2%29) instead, as it uses updated versions of the task, making it notably faster to run and resolving [a known bug](https://github.com/embeddings-benchmark/mteb/issues/1156) in existing tasks. 
This benchmark also removes datasets common for fine-tuning, such as MSMARCO, which makes model performance scores more comparable. However, generally, both benchmarks provide similar estimates. """, - citation="""@inproceedings{muennighoff-etal-2023-mteb, - title = "{MTEB}: Massive Text Embedding Benchmark", - author = "Muennighoff, Niklas and - Tazi, Nouamane and - Magne, Loic and - Reimers, Nils", - editor = "Vlachos, Andreas and - Augenstein, Isabelle", - booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics", - month = may, - year = "2023", - address = "Dubrovnik, Croatia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.eacl-main.148", - doi = "10.18653/v1/2023.eacl-main.148", - pages = "2014--2037", -} -""", + citation="""@article{muennighoff2022mteb, + author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, + title = {MTEB: Massive Text Embedding Benchmark}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2210.07316}, + year = {2022}, + url = {https://arxiv.org/abs/2210.07316}, + doi = {10.48550/ARXIV.2210.07316}, +}""", contacts=["Muennighoff"], ) @@ -235,15 +224,14 @@ description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, - title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, - author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov}, - year={2024}, - eprint={2408.12503}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2408.12503}, -} -""", + title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, 
author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov}, + year={2024}, + eprint={2408.12503}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2408.12503}, +}""", ) MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( @@ -258,12 +246,12 @@ description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", reference="https://arxiv.org/abs/2403.15246", citation="""@misc{weller2024followir, - title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, - author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, - year={2024}, - eprint={2403.15246}, - archivePrefix={arXiv}, - primaryClass={cs.IR} + title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, + author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, + year={2024}, + eprint={2403.15246}, + archivePrefix={arXiv}, + primaryClass={cs.IR} }""", ) @@ -327,14 +315,12 @@ including bitext mining and classification via retrieval-augmented contexts. 
""", reference="https://arxiv.org/pdf/2406.07424", - citation=""" - @article{winata2024miners, + citation="""@article{winata2024miners, title={MINERS: Multilingual Language Models as Semantic Retrievers}, author={Winata, Genta Indra and Zhang, Ruochen and Adelani, David Ifeoluwa}, journal={arXiv preprint arXiv:2406.07424}, year={2024} - } - """, +}""", ) SEB = Benchmark( @@ -379,11 +365,11 @@ description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", citation="""@inproceedings{enevoldsen2024scandinavian, - title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, - author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, - booktitle={Advances in Neural Information Processing Systems}, - year={2024}, - url={https://nips.cc/virtual/2024/poster/97869} + title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, + author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, + booktitle={Advances in Neural Information Processing Systems}, + year={2024}, + url={https://nips.cc/virtual/2024/poster/97869} }""", contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) @@ -407,14 +393,14 @@ description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", reference="https://github.com/CoIR-team/coir", citation="""@misc{li2024coircomprehensivebenchmarkcode, - title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, - author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, - year={2024}, - eprint={2407.02883}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - 
url={https://arxiv.org/abs/2407.02883}, - }""", + title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, + year={2024}, + eprint={2407.02883}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2407.02883}, +}""", ) RAR_b = Benchmark( @@ -443,11 +429,11 @@ description="A benchmark to evaluate reasoning capabilities of retrievers.", reference="https://arxiv.org/abs/2404.06347", citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Al Moubayed, Noura}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} - }""", + title={RAR-b: Reasoning as Retrieval Benchmark}, + author={Xiao, Chenghao and Hudson, G Thomas and Al Moubayed, Noura}, + journal={arXiv preprint arXiv:2404.06347}, + year={2024} +}""", contacts=["gowitheflow-1998"], ) @@ -494,13 +480,13 @@ description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, - title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2405.20468}, + title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, + author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + year={2024}, + eprint={2405.20468}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2405.20468}, }""", contacts=["imenelydiaker"], ) @@ -541,13 +527,13 @@ description="A benchmark for text-embedding performance in 
German.", reference="https://arxiv.org/html/2401.02709v1", citation="""@misc{wehrli2024germantextembeddingclustering, - title={German Text Embedding Clustering Benchmark}, - author={Silvan Wehrli and Bert Arnrich and Christopher Irrgang}, - year={2024}, - eprint={2401.02709}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2401.02709}, + title={German Text Embedding Clustering Benchmark}, + author={Silvan Wehrli and Bert Arnrich and Christopher Irrgang}, + year={2024}, + eprint={2401.02709}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2401.02709}, }""", contacts=["slvnwhrl"], ) @@ -1068,10 +1054,10 @@ """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", citation="""@article{zhu2024longembed, - title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, - author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, - journal={arXiv preprint arXiv:2404.12096}, - year={2024} + title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, + author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, + journal={arXiv preprint arXiv:2404.12096}, + year={2024} }""", ) @@ -1087,10 +1073,10 @@ """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, - title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, - author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, - journal={arXiv preprint arXiv:2407.12883}, - year={2024} + title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan 
and Siegel, Zachary S and Tang, Michael and others}, + journal={arXiv preprint arXiv:2407.12883}, + year={2024} }""", ) @@ -1114,10 +1100,10 @@ """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, - title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, - author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, - journal={arXiv preprint arXiv:2407.12883}, - year={2024} + title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal={arXiv preprint arXiv:2407.12883}, + year={2024} }""", ) @@ -1134,14 +1120,15 @@ description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.", reference="https://arxiv.org/abs/2406.14497", citation="""@misc{wang2024coderagbenchretrievalaugmentcode, - title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, - author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, - year={2024}, - eprint={2406.14497}, - archivePrefix={arXiv}, - primaryClass={cs.SE}, - url={https://arxiv.org/abs/2406.14497}, - }""", + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. 
Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, +}""", + display_on_leaderboard=False, ) BEIR = Benchmark( @@ -1168,10 +1155,10 @@ description="BEIR is a heterogeneous benchmark containing diverse IR tasks. It also provides a common and easy framework for evaluation of your NLP-based retrieval models within the benchmark.", reference="https://arxiv.org/abs/2104.08663", citation="""@article{thakur2021beir, - title={Beir: A heterogenous benchmark for zero-shot evaluation of information retrieval models}, - author={Thakur, Nandan and Reimers, Nils and R{\"u}ckl{\'e}, Andreas and Srivastava, Abhishek and Gurevych, Iryna}, - journal={arXiv preprint arXiv:2104.08663}, - year={2021} + title={Beir: A heterogenous benchmark for zero-shot evaluation of information retrieval models}, + author={Thakur, Nandan and Reimers, Nils and R{\"u}ckl{\'e}, Andreas and Srivastava, Abhishek and Gurevych, Iryna}, + journal={arXiv preprint arXiv:2104.08663}, + year={2021} } """, ) @@ -1253,12 +1240,12 @@ description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", citation="""@misc{c-pack, - title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, - year={2023}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, + author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + year={2023}, + eprint={2309.07597}, + archivePrefix={arXiv}, + primaryClass={cs.CL} }""", ) @@ -1412,12 +1399,12 @@ contacts=["nikolay-banar"], 
citation="""@misc{banar2024beirnlzeroshotinformationretrieval, title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, + author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + year={2024}, + eprint={2412.08329}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2412.08329}, }""", ) @@ -1476,22 +1463,21 @@ "STL10ZeroShot", "SUN397ZeroShot", "UCF101ZeroShot", - # Any2TextMutipleChoice + # Any2AnyMultipleChoice + "BLINKIT2IMultiChoice", + "BLINKIT2TMultiChoice", "CVBenchCount", "CVBenchRelation", "CVBenchDepth", "CVBenchDistance", - # Any2AnyMultipleChoice - "BLINKIT2IMultiChoice", - "BLINKIT2TMultiChoice", # Compositionality - "ImageCoDeT2IMultiChoice", "AROCocoOrder", "AROFlickrOrder", "AROVisualAttribution", "AROVisualRelation", "SugarCrepe", "Winoground", + "ImageCoDe", # VisualSTS "STS12VisualSTS", "STS13VisualSTS", @@ -1525,13 +1511,13 @@ "NIGHTSI2IRetrieval", "OVENIT2ITRetrieval", "OVENIT2TRetrieval", - "ROxfordEasyI2IMultiChoice", - "ROxfordMediumI2IMultiChoice", - "ROxfordHardI2IMultiChoice", + "ROxfordEasyI2IRetrieval", + "ROxfordMediumI2IRetrieval", + "ROxfordHardI2IRetrieval", "RP2kI2IRetrieval", - "RParisEasyI2IMultiChoice", - "RParisMediumI2IMultiChoice", - "RParisHardI2IMultiChoice", + "RParisEasyI2IRetrieval", + "RParisMediumI2IRetrieval", + "RParisHardI2IRetrieval", "SciMMIRI2TRetrieval", "SciMMIRT2IRetrieval", "SketchyI2IRetrieval", @@ -1620,20 +1606,19 @@ "Food101ZeroShot", "OxfordPetsZeroShot", "StanfordCarsZeroShot", - # Any2TextMutipleChoice + # Any2AnyMultipleChoice + "BLINKIT2IMultiChoice", "CVBenchCount", "CVBenchRelation", "CVBenchDepth", "CVBenchDistance", - # Any2AnyMultipleChoice - "BLINKIT2IMultiChoice", - "ImageCoDeT2IMultiChoice", # ImageTextPairClassification "AROCocoOrder", 
"AROFlickrOrder", "AROVisualAttribution", "AROVisualRelation", "Winoground", + "ImageCoDe", # VisualSTS "STS13VisualSTS", "STS15VisualSTS", diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index 6a12fe4fe1..7b0bc33ac4 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -80,8 +80,13 @@ def get_benchmark( def get_benchmarks( - names: list[str] | None = None, + names: list[str] | None = None, display_on_leaderboard: bool | None = None ) -> list[Benchmark]: if names is None: names = list(BENCHMARK_REGISTRY.keys()) - return [get_benchmark(name) for name in names] + benchmarks = [get_benchmark(name) for name in names] + if display_on_leaderboard is not None: + benchmarks = [ + b for b in benchmarks if b.display_on_leaderboard is display_on_leaderboard + ] + return benchmarks diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json deleted file mode 100644 index e26273628e..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 
256, - "min_relevant_docs_per_query": 0, - "average_relevant_docs_per_query": 43.27142857142857, - "max_relevant_docs_per_query": 248, - "unique_relevant_docs": 4993 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json index b3469b2b0b..e26273628e 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 584, - "num_queries": 68, - "num_documents": 516, + "num_samples": 5063, + "num_queries": 70, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 516, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, - "num_query_images": 68, - "min_relevant_docs_per_query": 1, - "average_relevant_docs_per_query": 44.544117647058826, + "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, + "min_relevant_docs_per_query": 0, + "average_relevant_docs_per_query": 43.27142857142857, "max_relevant_docs_per_query": 248, - "unique_relevant_docs": 516 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json deleted 
file mode 100644 index a6b9a21ac5..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 1, - "average_relevant_docs_per_query": 35.67142857142857, - "max_relevant_docs_per_query": 284, - "unique_relevant_docs": 4993 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json index 515fa3a5da..a6b9a21ac5 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 755, + "num_samples": 5063, "num_queries": 70, - "num_documents": 685, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 685, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + 
"average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 35.67142857142857, "max_relevant_docs_per_query": 284, - "unique_relevant_docs": 685 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json deleted file mode 100644 index 333bbe786d..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 78.94285714285714, - "max_relevant_docs_per_query": 347, - "unique_relevant_docs": 4993 - } -} \ No newline at 
end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json index 8ca55933c4..333bbe786d 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 858, + "num_samples": 5063, "num_queries": 70, - "num_documents": 788, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 788, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 2, "average_relevant_docs_per_query": 78.94285714285714, "max_relevant_docs_per_query": 347, - "unique_relevant_docs": 788 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json deleted file mode 100644 index 5cf0e5ee74..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - 
"max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 98.2, - "max_relevant_docs_per_query": 199, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json index b21a7cfdd2..5cf0e5ee74 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 1540, + "num_samples": 6392, "num_queries": 70, - "num_documents": 1470, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 1470, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 
256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 2, "average_relevant_docs_per_query": 98.2, "max_relevant_docs_per_query": 199, - "unique_relevant_docs": 1470 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json deleted file mode 100644 index 87f882d612..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 34, - "average_relevant_docs_per_query": 147.85714285714286, - "max_relevant_docs_per_query": 556, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json index a704a31bb2..87f882d612 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json @@ 
-1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 2048, + "num_samples": 6392, "num_queries": 70, - "num_documents": 1978, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 1978, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 34, "average_relevant_docs_per_query": 147.85714285714286, "max_relevant_docs_per_query": 556, - "unique_relevant_docs": 1978 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json deleted file mode 100644 index 95f4f9b84a..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - 
"max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 76, - "average_relevant_docs_per_query": 246.05714285714285, - "max_relevant_docs_per_query": 636, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json index 65473fb4ed..95f4f9b84a 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 2721, + "num_samples": 6392, "num_queries": 70, - "num_documents": 2651, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 2651, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 76, "average_relevant_docs_per_query": 246.05714285714285, "max_relevant_docs_per_query": 636, - "unique_relevant_docs": 2651 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/Image/Compositionality/ImageCoDe.json b/mteb/descriptive_stats/Image/Compositionality/ImageCoDe.json new file mode 100644 index 0000000000..ea5633e5dd --- /dev/null +++ b/mteb/descriptive_stats/Image/Compositionality/ImageCoDe.json @@ -0,0 +1,11 @@ +{ + "test": { + "num_samples": 2302, + "num_images": 23020, + "num_texts": 2302, + "num_unique_texts": 2302, + "min_text_length": 1, + "average_text_length": 102.71807124239791, + "max_text_length": 350 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Compositionality/ImageCoDeT2IMultiChoice.json b/mteb/descriptive_stats/Image/Compositionality/ImageCoDeT2IMultiChoice.json deleted file mode 100644 index 4d36f88146..0000000000 --- a/mteb/descriptive_stats/Image/Compositionality/ImageCoDeT2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 236457, - "num_samples": 25322, - "num_queries": 2302, - "num_documents": 23020, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 23020, - "min_query_length": 1, - "average_query_length": 102.71807124239791, - "max_query_length": 350, - "unique_queries": 2302, - "num_query_images": 0, - "min_query_image_width": 0, - "average_query_image_width": 0, - "max_query_image_width": 0, - "min_query_image_height": 0, - "average_query_image_height": 0, - "max_query_image_height": 0, - "min_relevant_docs_per_query": 1, - "average_relevant_docs_per_query": 1.0, - "max_relevant_docs_per_query": 1, - "unique_relevant_docs": 10390 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisionCentric/CVBenchCount.json b/mteb/descriptive_stats/Image/VisionCentric/CVBenchCount.json 
deleted file mode 100644 index 5ea0f28c08..0000000000 --- a/mteb/descriptive_stats/Image/VisionCentric/CVBenchCount.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "test": { - "num_samples": 788, - "min_image_width": 200, - "average_image_width": 757.6789340101523, - "max_image_width": 2200, - "min_image_height": 181, - "average_image_height": 631.3147208121827, - "max_image_height": 2200, - "min_num_choices": 4, - "average_num_choices": 4.550761421319797, - "max_num_choices": 6, - "min_question_length": 30, - "average_question_length": 34.35406091370558, - "max_question_length": 45, - "answers": { - "2": { - "count": 169 - }, - "4": { - "count": 63 - }, - "3": { - "count": 167 - }, - "1": { - "count": 184 - }, - "0": { - "count": 182 - }, - "5": { - "count": 23 - } - } - } -} diff --git a/mteb/descriptive_stats/Image/VisionCentric/CVBenchDepth.json b/mteb/descriptive_stats/Image/VisionCentric/CVBenchDepth.json deleted file mode 100644 index cf2523ac25..0000000000 --- a/mteb/descriptive_stats/Image/VisionCentric/CVBenchDepth.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "test": { - "num_samples": 600, - "min_image_width": 561, - "average_image_width": 1090.9616666666666, - "max_image_width": 1600, - "min_image_height": 427, - "average_image_height": 715.985, - "max_image_height": 900, - "min_num_choices": 2, - "average_num_choices": 2.0, - "max_num_choices": 2, - "min_question_length": 130, - "average_question_length": 136.04333333333332, - "max_question_length": 147, - "answers": { - "0": { - "count": 300 - }, - "1": { - "count": 300 - } - } - } -} diff --git a/mteb/descriptive_stats/Image/VisionCentric/CVBenchDistance.json b/mteb/descriptive_stats/Image/VisionCentric/CVBenchDistance.json deleted file mode 100644 index 42a155d5c7..0000000000 --- a/mteb/descriptive_stats/Image/VisionCentric/CVBenchDistance.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "test": { - "num_samples": 600, - "min_image_width": 561, - "average_image_width": 1099.2883333333334, - "max_image_width": 1600, - 
"min_image_height": 427, - "average_image_height": 720.9983333333333, - "max_image_height": 900, - "min_num_choices": 2, - "average_num_choices": 2.0, - "max_num_choices": 2, - "min_question_length": 204, - "average_question_length": 212.40333333333334, - "max_question_length": 223, - "answers": { - "0": { - "count": 303 - }, - "1": { - "count": 297 - } - } - } -} diff --git a/mteb/descriptive_stats/Image/VisionCentric/CVBenchRelation.json b/mteb/descriptive_stats/Image/VisionCentric/CVBenchRelation.json deleted file mode 100644 index bf9c463e70..0000000000 --- a/mteb/descriptive_stats/Image/VisionCentric/CVBenchRelation.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "test": { - "num_samples": 650, - "min_image_width": 189, - "average_image_width": 546.3169230769231, - "max_image_width": 2200, - "min_image_height": 190, - "average_image_height": 448.4492307692308, - "max_image_height": 2200, - "min_num_choices": 2, - "average_num_choices": 2.0, - "max_num_choices": 2, - "min_question_length": 132, - "average_question_length": 181.45846153846153, - "max_question_length": 224, - "answers": { - "0": { - "count": 327 - }, - "1": { - "count": 323 - } - } - } -} diff --git a/mteb/descriptive_stats/Image/VisionCentric/BLINKIT2IMultiChoice.json b/mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2IMultiChoice.json similarity index 99% rename from mteb/descriptive_stats/Image/VisionCentric/BLINKIT2IMultiChoice.json rename to mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2IMultiChoice.json index a290475a94..ff7f1b0d82 100644 --- a/mteb/descriptive_stats/Image/VisionCentric/BLINKIT2IMultiChoice.json +++ b/mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2IMultiChoice.json @@ -31,4 +31,4 @@ "max_relevant_docs_per_query": 1, "unique_relevant_docs": 804 } -} \ No newline at end of file +} diff --git a/mteb/descriptive_stats/Image/VisionCentric/BLINKIT2TMultiChoice.json b/mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2TMultiChoice.json similarity index 99% rename from 
mteb/descriptive_stats/Image/VisionCentric/BLINKIT2TMultiChoice.json rename to mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2TMultiChoice.json index 747d9238f1..9fd8347d13 100644 --- a/mteb/descriptive_stats/Image/VisionCentric/BLINKIT2TMultiChoice.json +++ b/mteb/descriptive_stats/Image/VisionCentricQA/BLINKIT2TMultiChoice.json @@ -31,4 +31,4 @@ "max_relevant_docs_per_query": 1, "unique_relevant_docs": 20 } -} \ No newline at end of file +} diff --git a/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchCount.json b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchCount.json new file mode 100644 index 0000000000..f8927f7f02 --- /dev/null +++ b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchCount.json @@ -0,0 +1,34 @@ +{ + "test": { + "number_of_characters": 27095, + "num_samples": 805, + "num_queries": 788, + "num_documents": 17, + "min_document_length": 1, + "average_document_length": 1.411764705882353, + "max_document_length": 2, + "unique_documents": 17, + "min_document_image_width": 0, + "average_document_image_width": 0, + "max_document_image_width": 0, + "min_document_image_height": 0, + "average_document_image_height": 0, + "max_document_image_height": 0, + "num_document_images": 0, + "min_query_length": 30, + "average_query_length": 34.35406091370558, + "max_query_length": 45, + "unique_queries": 197, + "num_query_images": 788, + "min_query_image_width": 181, + "average_query_image_width": 631.3147208121827, + "max_query_image_width": 2200, + "min_query_image_height": 200, + "average_query_image_height": 757.6789340101523, + "max_query_image_height": 2200, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 17 + } +} diff --git a/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDepth.json b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDepth.json new file mode 100644 index 0000000000..1c60700905 --- /dev/null +++ 
b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDepth.json @@ -0,0 +1,34 @@ +{ + "test": { + "number_of_characters": 82092, + "num_samples": 669, + "num_queries": 600, + "num_documents": 69, + "min_document_length": 3, + "average_document_length": 6.753623188405797, + "max_document_length": 17, + "unique_documents": 69, + "min_document_image_width": 0, + "average_document_image_width": 0, + "max_document_image_width": 0, + "min_document_image_height": 0, + "average_document_image_height": 0, + "max_document_image_height": 0, + "num_document_images": 0, + "min_query_length": 130, + "average_query_length": 136.04333333333332, + "max_query_length": 147, + "unique_queries": 279, + "num_query_images": 600, + "min_query_image_width": 427, + "average_query_image_width": 715.985, + "max_query_image_width": 900, + "min_query_image_height": 561, + "average_query_image_height": 1090.9616666666666, + "max_query_image_height": 1600, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 69 + } +} diff --git a/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDistance.json b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDistance.json new file mode 100644 index 0000000000..276b0c92d0 --- /dev/null +++ b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchDistance.json @@ -0,0 +1,34 @@ +{ + "test": { + "number_of_characters": 127804, + "num_samples": 656, + "num_queries": 600, + "num_documents": 56, + "min_document_length": 3, + "average_document_length": 6.464285714285714, + "max_document_length": 12, + "unique_documents": 56, + "min_document_image_width": 0, + "average_document_image_width": 0, + "max_document_image_width": 0, + "min_document_image_height": 0, + "average_document_image_height": 0, + "max_document_image_height": 0, + "num_document_images": 0, + "min_query_length": 204, + "average_query_length": 212.40333333333334, + "max_query_length": 223, + "unique_queries": 381, 
+ "num_query_images": 600, + "min_query_image_width": 427, + "average_query_image_width": 720.9983333333333, + "max_query_image_width": 900, + "min_query_image_height": 561, + "average_query_image_height": 1099.2883333333334, + "max_query_image_height": 1600, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 56 + } +} diff --git a/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchRelation.json b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchRelation.json new file mode 100644 index 0000000000..74c7bc08fb --- /dev/null +++ b/mteb/descriptive_stats/Image/VisionCentricQA/CVBenchRelation.json @@ -0,0 +1,34 @@ +{ + "test": { + "number_of_characters": 117967, + "num_samples": 654, + "num_queries": 650, + "num_documents": 4, + "min_document_length": 4, + "average_document_length": 4.75, + "max_document_length": 5, + "unique_documents": 4, + "min_document_image_width": 0, + "average_document_image_width": 0, + "max_document_image_width": 0, + "min_document_image_height": 0, + "average_document_image_height": 0, + "max_document_image_height": 0, + "num_document_images": 0, + "min_query_length": 132, + "average_query_length": 181.45846153846153, + "max_query_length": 224, + "unique_queries": 580, + "num_query_images": 650, + "min_query_image_width": 190, + "average_query_image_width": 448.4492307692308, + "max_query_image_width": 2200, + "min_query_image_height": 189, + "average_query_image_height": 546.3169230769231, + "max_query_image_height": 2200, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 4 + } +} diff --git a/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py deleted file mode 100644 index d84c239c6c..0000000000 --- a/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py +++ 
/dev/null @@ -1,102 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Any - -import numpy as np -import torch -from sklearn.metrics import accuracy_score -from sklearn.metrics.pairwise import cosine_similarity -from tqdm import tqdm - -from mteb.create_dataloaders import ( - create_dataloader_from_texts, - create_image_dataloader, -) -from mteb.encoder_interface import Encoder, EncoderWithSimilarity -from mteb.evaluation.evaluators.Evaluator import Evaluator - -logger = logging.getLogger(__name__) - - -class Any2TextMultipleChoiceEvaluator(Evaluator): - """Evaluate a model based on the similarity of queries (can be interleaved) and candidate answers. - The goal is to find the correct text in multiple candidates that - forms the correct answer of the interleaved query. - - Args: - query_modalities: the modality of queries; supports image and text or either at the moment, - query_column_names: column names of queries; should align with query modalities. - label_column_name: column name of labels; - choices_column_names: column name of candidate choices; - """ - - def __init__( - self, - dataset, - query_modalities: str | list[str], - query_column_names: dict, - label_column_name: str, - choices_column_name: str, - task_name: str | None = None, - limit: int | None = None, - **kwargs, - ): - super().__init__(**kwargs) - if limit: - dataset = dataset.select(range(limit)) - self.dataset = dataset - self.query_modalities = query_modalities - self.query_column_names = query_column_names - self.label_column_name = label_column_name - self.choices_column_name = choices_column_name - self.task_name = task_name - - def __call__( - self, - model: Encoder | EncoderWithSimilarity, - encode_kwargs: dict[str, Any] = {}, - ): - if "batch_size" not in encode_kwargs: - encode_kwargs["batch_size"] = 64 - - choices = self.dataset[self.choices_column_name] - answers = self.dataset[self.label_column_name] - label_list = list({x for n in choices for x in n}) 
- label_embeddings = model.encode( - create_dataloader_from_texts( - label_list, - ), - task_name=self.task_name, - batch_size=encode_kwargs["batch_size"], - ) - label_embedding_dict = {} - for label, embedding in zip(label_list, label_embeddings): - label_embedding_dict[label] = embedding - - dataset = create_image_dataloader( - self.dataset.remove_columns( - [self.choices_column_name, self.label_column_name] - ), - batch_size=encode_kwargs["batch_size"], - ) - - query_embeddings = model.encode( - dataset, - task_name=self.task_name, - batch_size=encode_kwargs["batch_size"], - ) - - # note that answers are the indeces - predictions = [] - for q_embedding, choice in tqdm(zip(query_embeddings, choices)): - choice_embeddings = torch.vstack( - [label_embedding_dict[c] for c in choice] - ) # (choice_size, embedding_dim) - q_embedding = q_embedding[np.newaxis, :] - cos_sim = cosine_similarity(q_embedding, choice_embeddings) - predictions.append(np.argmax(cos_sim)) - - metrics = {} - metrics["accuracy"] = accuracy_score(predictions, answers) - return metrics diff --git a/mteb/evaluation/evaluators/Image/__init__.py b/mteb/evaluation/evaluators/Image/__init__.py index 2405efa0a1..8bfdb451ad 100644 --- a/mteb/evaluation/evaluators/Image/__init__.py +++ b/mteb/evaluation/evaluators/Image/__init__.py @@ -2,7 +2,6 @@ from .Any2AnyMultiChoiceEvaluator import Any2AnyMultiChoiceEvaluator from .Any2AnyRetrievalEvaluator import Any2AnyRetrievalEvaluator -from .Any2TextMultipleChoiceEvaluator import Any2TextMultipleChoiceEvaluator from .ClassificationEvaluator import ( ImagekNNClassificationEvaluator, ImagekNNClassificationEvaluatorPytorch, @@ -16,7 +15,6 @@ __all__ = [ "Any2AnyMultiChoiceEvaluator", "Any2AnyRetrievalEvaluator", - "Any2TextMultipleChoiceEvaluator", "ImagekNNClassificationEvaluator", "ImagelogRegClassificationEvaluator", "ImagekNNClassificationEvaluatorPytorch", diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index 
4405751cbd..cbd2e00e2c 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -11,7 +11,6 @@ from .Image import ( Any2AnyMultiChoiceEvaluator, Any2AnyRetrievalEvaluator, - Any2TextMultipleChoiceEvaluator, ImageClusteringEvaluator, ImagekNNClassificationEvaluator, ImagekNNClassificationEvaluatorPytorch, @@ -40,12 +39,10 @@ "BitextMiningEvaluator", "PairClassificationEvaluator", "kNNClassificationEvaluator", - "kNNClassificationEvaluatorPytorch", "logRegClassificationEvaluator", "dot_distance", "Any2AnyMultiChoiceEvaluator", "Any2AnyRetrievalEvaluator", - "Any2TextMultipleChoiceEvaluator", "ImagekNNClassificationEvaluator", "ImagelogRegClassificationEvaluator", "ImagekNNClassificationEvaluatorPytorch", diff --git a/mteb/leaderboard/__init__.py b/mteb/leaderboard/__init__.py index 1dc3560a64..1db3fa2545 100644 --- a/mteb/leaderboard/__init__.py +++ b/mteb/leaderboard/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations -from mteb.leaderboard.app import demo +from mteb.leaderboard.app import get_leaderboard_app -__all__ = ["demo"] +__all__ = ["get_leaderboard_app"] diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 143d925bcd..e3833b5ce3 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -21,15 +21,8 @@ from mteb.custom_validators import MODALITIES from mteb.languages import ISO_TO_LANGUAGE from mteb.leaderboard.figures import performance_size_plot, radar_chart -from mteb.leaderboard.table import scores_to_tables - -logging.getLogger("mteb.load_results.task_results").setLevel( - logging.WARNING -) # Warnings related to task split -logging.getLogger("mteb.models.overview").setLevel( - logging.WARNING -) # Warning related to model metadata (fetch_from_hf=False) -warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*") +from mteb.leaderboard.table import create_tables + logger = logging.getLogger(__name__) acknowledgment_md = """ @@ -210,661 +203,688 @@ def 
filter_models( return list(models_to_keep) -logger.info("Loading all benchmark results") -all_results = load_results() - -benchmarks = sorted(mteb.get_benchmarks(), key=lambda x: x.name) -all_benchmark_results = { - benchmark.name: benchmark.load_results(base_results=all_results).join_revisions() - for benchmark in benchmarks -} -default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) -default_results = all_benchmark_results[default_benchmark.name] -logger.info("Benchmark results loaded") - -default_scores = default_results.get_scores(format="long") -all_models = list({entry["model_name"] for entry in default_scores}) -filtered_models = filter_models( - all_models, - default_results.task_names, - availability=None, - compatibility=[], - instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - zero_shot_setting="allow_all", -) - -summary_table, per_task_table = scores_to_tables( - [entry for entry in default_scores if entry["model_name"] in filtered_models] -) - -benchmark_select = gr.Dropdown( - [bench.name for bench in benchmarks], - value=default_benchmark.name, - label="Prebuilt Benchmarks", - info="Select one of our expert-selected benchmarks from MTEB publications.", -) -lang_select = gr.Dropdown( - ISO_TO_LANGUAGE, - value=sorted(default_results.languages), - allow_custom_value=True, - multiselect=True, - label="Language", - info="Select languages to include.", -) -type_select = gr.Dropdown( - sorted(get_args(TASK_TYPE)), - value=sorted(default_results.task_types), - multiselect=True, - label="Task Type", - info="Select task types to include.", -) -domain_select = gr.Dropdown( - sorted(get_args(TASK_DOMAIN)), - value=sorted(default_results.domains), - multiselect=True, - label="Domain", - info="Select domains to include.", -) -task_select = gr.Dropdown( - sorted(all_results.task_names), - value=sorted(default_results.task_names), - allow_custom_value=True, - multiselect=True, - label="Task", - info="Select specific tasks to include", -) 
-modality_select = gr.Dropdown( - sorted(get_args(MODALITIES)), - value=sorted(default_results.modalities), - multiselect=True, - label="Modality", - info="Select modalities to include.", -) - -head = """ - -""" +def get_leaderboard_app() -> gr.Blocks: + logger.info("Loading all benchmark results") + all_results = load_results() -with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: - gr.Markdown( - """ - ## Embedding Leaderboard + benchmarks = sorted( + mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name + ) + all_benchmark_results = { + benchmark.name: benchmark.load_results( + base_results=all_results + ).join_revisions() + for benchmark in benchmarks + } + default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) + default_results = all_benchmark_results[default_benchmark.name] + logger.info("Benchmark results loaded") + + default_scores = default_results.get_scores(format="long") + all_models = list({entry["model_name"] for entry in default_scores}) + filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="allow_all", + ) - This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. 
Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ + summary_table, per_task_table = create_tables( + [entry for entry in default_scores if entry["model_name"] in filtered_models] + ) - > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. - """ + benchmark_select = gr.Dropdown( + [bench.name for bench in benchmarks], + value=default_benchmark.name, + label="Prebuilt Benchmarks", + info="Select one of our expert-selected benchmarks from MTEB publications.", + ) + lang_select = gr.Dropdown( + ISO_TO_LANGUAGE, + value=sorted(default_results.languages), + allow_custom_value=True, + multiselect=True, + label="Language", + info="Select languages to include.", + ) + type_select = gr.Dropdown( + sorted(get_args(TASK_TYPE)), + value=sorted(default_results.task_types), + multiselect=True, + label="Task Type", + info="Select task types to include.", + ) + domain_select = gr.Dropdown( + sorted(get_args(TASK_DOMAIN)), + value=sorted(default_results.domains), + multiselect=True, + label="Domain", + info="Select domains to include.", + ) + task_select = gr.Dropdown( + sorted(all_results.task_names), + value=sorted(default_results.task_names), + allow_custom_value=True, + multiselect=True, + label="Task", + info="Select specific tasks to include", + ) + modality_select = gr.Dropdown( + sorted(get_args(MODALITIES)), + 
value=sorted(default_results.modalities), + multiselect=True, + label="Modality", + info="Select modalities to include.", ) - with gr.Row(): - with gr.Column(scale=5): - gr.Markdown( - """ - ### Benchmarks - Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs. + head = """ + + """ + + with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + gr.Markdown( """ - ) - with gr.Group(): - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - with gr.Column(): - benchmark_select.render() - with gr.Accordion("Select Languages", open=False): - lang_select.render() - with gr.Accordion("Select Task Types", open=False): - type_select.render() - with gr.Accordion("Select Domains", open=False): - domain_select.render() - with gr.Accordion("Select Modalities", open=False): - modality_select.render() - with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() - with gr.Column(scale=8): - gr.Markdown( - """ - ### Model Selection - Select models to rank based on an assortment of criteria. - """, - ) - with gr.Group(): - with gr.Row(): - searchbar = gr.Textbox( - label="Search Models", - info="Press Enter to search.\nSearch models by name (RegEx sensitive. 
Separate queries with `|`)", - interactive=True, - ) - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", - ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - with gr.Row(elem_classes=""): - with gr.Column(): - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", + ## Embedding Leaderboard + + This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ + + > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. + """ + ) + + with gr.Row(): + with gr.Column(scale=5): + gr.Markdown( + "### Benchmarks\n" + "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs." 
+ ) + with gr.Group(): + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + with gr.Column(): + benchmark_select.render() + with gr.Accordion("Select Languages", open=False): + lang_select.render() + with gr.Accordion("Select Task Types", open=False): + type_select.render() + with gr.Accordion("Select Domains", open=False): + domain_select.render() + with gr.Accordion("Select Modalities", open=False): + modality_select.render() + with gr.Accordion("Add and remove tasks:", open=False): + task_select.render() + with gr.Column(scale=8): + gr.Markdown( + """ + ### Model Selection + Select models to rank based on an assortment of criteria. + """, + ) + with gr.Group(): + with gr.Row(): + searchbar = gr.Textbox( + label="Search Models", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)", interactive=True, ) - with gr.Column(): - zero_shot = gr.Radio( + compatibility = gr.CheckboxGroup( [ ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", "remove_unknown"), - ("Allow All", "allow_all"), + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) ], - value="allow_all", - label="Zero-shot", + value=[], + label="Compatibility", interactive=True, ) - model_size = RangeSlider( - minimum=MIN_MODEL_SIZE, - maximum=MAX_MODEL_SIZE, - value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - label="Model Size (#M Parameters)", - ) - scores = gr.State(default_scores) - models = gr.State(filtered_models) - with gr.Row(): - with gr.Column(): - description = gr.Markdown( - update_description, - inputs=[benchmark_select, lang_select, type_select, domain_select], + with gr.Row(elem_classes=""): + with gr.Column(): + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + 
value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + model_size = RangeSlider( + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + label="Model Size (#M Parameters)", + ) + scores = gr.State(default_scores) + models = gr.State(filtered_models) + with gr.Row(): + with gr.Column(): + description = gr.Markdown( # noqa: F841 + update_description, + inputs=[benchmark_select, lang_select, type_select, domain_select], + ) + citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 + with gr.Accordion("Share this benchmark:", open=False): + gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) + with gr.Column(): + with gr.Tab("Performance per Model Size"): + plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all tasks in the benchmark*" + ) + with gr.Tab("Performance per Task Type (Radar Chart)"): + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all task types in the benchmark*" + ) + with gr.Tab("Summary"): + summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] ) - citation = gr.Markdown(update_citation, inputs=[benchmark_select]) - with gr.Accordion("Share this benchmark:", open=False): - gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(): - with gr.Tab("Performance per Model Size"): - plot = gr.Plot(performance_size_plot, inputs=[summary_table]) + + with gr.Accordion( + "What do aggregate measures (Rank(Borda), Mean(Task), etc.) 
mean?", + open=False, + ): + gr.Markdown( + """ + **Rank(borda)** is computed based on the [borda count](https://en.wikipedia.org/wiki/Borda_count), where each task is treated as a preference voter, which gives votes on the models per their relative performance on the task. The best model obtains the highest number of votes. The model with the highest number of votes across tasks obtains the highest rank. The Borda rank tends to prefer models that perform well broadly across tasks. However, given that it is a rank it can be unclear if the two models perform similarly. + + **Mean(Task)**: This is a naïve average computed across all the tasks within the benchmark. This score is simple to understand and is continuous as opposed to the Borda rank. However, the mean can overvalue tasks with higher variance in its scores. + + **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. + """ + ) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" + """ + A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. + The percentages in the table indicate what portion of the benchmark can be considered out-of-distribution for a given model. 
+ 100% means the model has not been trained on any of the datasets in a given benchmark, and therefore the benchmark score can be interpreted as the model's overall generalization performance, + while 50% means the model has been finetuned on half of the tasks in the benchmark, thereby indicating that the benchmark results should be interpreted with a pinch of salt. + This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. + Distilled, further fine-tunes, or in other ways, derivative models inherit the datasets of their parent models. + Based on community feedback and research findings, this definition may change in the future. Please open a PR if you notice any mistakes or want to help us refine annotations, see [GitHub](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91). + """ ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) + with gr.Accordion( + "What do the other columns mean?", + open=False, + ): gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" + """ + - **Number of Parameters**: This is the total number of parameters in the model including embedding parameters. A higher value means the model requires more CPU/GPU memory to run; thus, less is generally desirable. + - **Embedding Dimension**: This is the vector dimension of the embeddings that the model produces. When saving embeddings to disk, a higher dimension will require more space, thus less is usually desirable. + - **Max tokens**: This refers to how many tokens (=word pieces) the model can process. Generally, a larger value is desirable. 
+ - **Zero-shot**: This indicates if the model is zero-shot on the benchmark. For more information on zero-shot see the info box above. + """ ) - with gr.Tab("Summary"): - summary_table.render() - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) - - with gr.Accordion( - "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", - open=False, - ): - gr.Markdown( + with gr.Accordion( + "Why is a model missing or not showing up?", + open=False, + ): + gr.Markdown( + """ + Possible reasons why a model may not show up in the leaderboard: + + - **Filter Setting**: It is being filtered out with your current filter. By default, we do not show models that are not zero-shot on the benchmark. + You can change this setting in the model selection panel. + - **Missing Results**: The model may not have been run on the tasks in the benchmark. We only display models that have been run on at least one task + in the benchmark. For visualizations that require the mean across all tasks, we only display models that have been run on all tasks in the benchmark. + You can see existing results in the [results repository](https://github.com/embeddings-benchmark/results). This is also where new results are added via PR. + - **Missing Metadata**: Currently, we only show models for which we have metadata in [mteb](https://github.com/embeddings-benchmark/mteb). + You can follow this guide on how to add a [model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) and + see existing implementations [here](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/models). """ - **Rank(borda)** is computed based on the [borda count](https://en.wikipedia.org/wiki/Borda_count), where each task is treated as a preference voter, which gives votes on the models per their relative performance on the task. The best model obtains the highest number of votes. 
The model with the highest number of votes across tasks obtains the highest rank. The Borda rank tends to prefer models that perform well broadly across tasks. However, given that it is a rank it can be unclear if the two models perform similarly. + ) + with gr.Tab("Performance per task"): + per_task_table.render() + download_per_task = gr.DownloadButton("Download Table") + download_per_task.click( + download_table, inputs=[per_task_table], outputs=[download_per_task] + ) + with gr.Tab("Task information"): + task_info_table = gr.DataFrame(update_task_info, inputs=[task_select]) # noqa: F841 - **Mean(Task)**: This is a naïve average computed across all the tasks within the benchmark. This score is simple to understand and is continuous as opposed to the Borda rank. However, the mean can overvalue tasks with higher variance in its scores. + # This sets the benchmark from the URL query parameters + demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) - **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. - """ - ) - with gr.Accordion( - "What does zero-shot mean?", - open=False, - ): - gr.Markdown( - """ -A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. -The percentages in the table indicate what portion of the benchmark can be considered out-of-distribution for a given model. 
-100% means the model has not been trained on any of the datasets in a given benchmark, and therefore the benchmark score can be interpreted as the model's overall generalization performance, -while 50% means the model has been finetuned on half of the tasks in the benchmark, thereby indicating that the benchmark results should be interpreted with a pinch of salt. -This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. -Distilled, further fine-tunes, or in other ways, derivative models inherit the datasets of their parent models. -Based on community feedback and research findings, this definition may change in the future. Please open a PR if you notice any mistakes or want to help us refine annotations, see [GitHub](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91). - """ - ) - with gr.Accordion( - "What do the other columns mean?", - open=False, - ): - gr.Markdown( - """ -- **Number of Parameters**: This is the total number of parameters in the model including embedding parameters. A higher value means the model requires more CPU/GPU memory to run; thus, less is generally desirable. -- **Embedding Dimension**: This is the vector dimension of the embeddings that the model produces. When saving embeddings to disk, a higher dimension will require more space, thus less is usually desirable. -- **Max tokens**: This refers to how many tokens (=word pieces) the model can process. Generally, a larger value is desirable. -- **Zero-shot**: This indicates if the model is zero-shot on the benchmark. For more information on zero-shot see the info box above. 
- """ + @cachetools.cached( + cache={}, + key=lambda benchmark_name: hash(benchmark_name), + ) + def on_benchmark_select(benchmark_name): + start_time = time.time() + benchmark = mteb.get_benchmark(benchmark_name) + languages = [task.languages for task in benchmark.tasks if task.languages] + languages = set(itertools.chain.from_iterable(languages)) + languages = sorted(languages) + domains = [ + task.metadata.domains + for task in benchmark.tasks + if task.metadata.domains + ] + domains = set(itertools.chain.from_iterable(domains)) + types = { + task.metadata.type for task in benchmark.tasks if task.metadata.type + } + modalities = set() + for task in benchmark.tasks: + modalities.update(task.metadata.modalities) + languages, domains, types, modalities = ( + sorted(languages), + sorted(domains), + sorted(types), + sorted(modalities), ) - with gr.Accordion( - "Why is a model missing or not showing up?", - open=False, - ): - gr.Markdown( - """ -Possible reasons why a model may not show up in the leaderboard: - -- **Filter Setting**: It is being filtered out with your current filter. By default, we do not show models that are not zero-shot on the benchmark. -You can change this setting in the model selection panel. -- **Missing Results**: The model may not have been run on the tasks in the benchmark. We only display models that have been run on at least one task -in the benchmark. For visualizations that require the mean across all tasks, we only display models that have been run on all tasks in the benchmark. -You can see existing results in the [results repository](https://github.com/embeddings-benchmark/results). This is also where new results are added via PR. -- **Missing Metadata**: Currently, we only show models for which we have metadata in [mteb](https://github.com/embeddings-benchmark/mteb). 
-You can follow this guide on how to add a [model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) and -see existing implementations [here](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/models). - """ + elapsed = time.time() - start_time + benchmark_results = all_benchmark_results[benchmark_name] + scores = benchmark_results.get_scores(format="long") + logger.info(f"on_benchmark_select callback: {elapsed}s") + return ( + languages, + domains, + types, + modalities, + sorted([task.metadata.name for task in benchmark.tasks]), + scores, ) - with gr.Tab("Performance per task"): - per_task_table.render() - download_per_task = gr.DownloadButton("Download Table") - download_per_task.click( - download_table, inputs=[per_task_table], outputs=[download_per_task] - ) - with gr.Tab("Task information"): - task_info_table = gr.DataFrame(update_task_info, inputs=[task_select]) - # This sets the benchmark from the URL query parameters - demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) + benchmark_select.change( + on_benchmark_select, + inputs=[benchmark_select], + outputs=[ + lang_select, + domain_select, + type_select, + modality_select, + task_select, + scores, + ], + ) - @cachetools.cached( - cache={}, - key=lambda benchmark_name: hash(benchmark_name), - ) - def on_benchmark_select(benchmark_name): - start_time = time.time() - benchmark = mteb.get_benchmark(benchmark_name) - languages = [task.languages for task in benchmark.tasks if task.languages] - languages = set(itertools.chain.from_iterable(languages)) - languages = sorted(languages) - domains = [ - task.metadata.domains for task in benchmark.tasks if task.metadata.domains - ] - domains = set(itertools.chain.from_iterable(domains)) - types = {task.metadata.type for task in benchmark.tasks if task.metadata.type} - modalities = set() - for task in benchmark.tasks: - modalities.update(task.metadata.modalities) - languages, domains, types, modalities = ( - 
sorted(languages), - sorted(domains), - sorted(types), - sorted(modalities), + @cachetools.cached( + cache={}, + key=lambda benchmark_name, languages: hash( + (hash(benchmark_name), hash(tuple(languages))) + ), ) - elapsed = time.time() - start_time - benchmark_results = all_benchmark_results[benchmark_name] - scores = benchmark_results.get_scores(format="long") - logger.info(f"on_benchmark_select callback: {elapsed}s") - return ( - languages, - domains, - types, - modalities, - sorted([task.metadata.name for task in benchmark.tasks]), - scores, + def update_scores_on_lang_change(benchmark_name, languages): + start_time = time.time() + benchmark_results = all_benchmark_results[benchmark_name] + scores = benchmark_results.get_scores(languages=languages, format="long") + elapsed = time.time() - start_time + logger.info(f"update_scores callback: {elapsed}s") + return scores + + lang_select.input( + update_scores_on_lang_change, + inputs=[benchmark_select, lang_select], + outputs=[scores], ) - benchmark_select.change( - on_benchmark_select, - inputs=[benchmark_select], - outputs=[ - lang_select, - domain_select, - type_select, - modality_select, - task_select, - scores, - ], - ) - - @cachetools.cached( - cache={}, - key=lambda benchmark_name, languages: hash( - (hash(benchmark_name), hash(tuple(languages))) - ), - ) - def update_scores_on_lang_change(benchmark_name, languages): - start_time = time.time() - benchmark_results = all_benchmark_results[benchmark_name] - scores = benchmark_results.get_scores(languages=languages, format="long") - elapsed = time.time() - start_time - logger.info(f"update_scores callback: {elapsed}s") - return scores - - lang_select.input( - update_scores_on_lang_change, - inputs=[benchmark_select, lang_select], - outputs=[scores], - ) - - @cachetools.cached( - cache={}, - key=lambda benchmark_name, - type_select, - domain_select, - lang_select, - modality_select: hash( - ( - hash(benchmark_name), - hash(tuple(type_select)), - 
hash(tuple(domain_select)), - hash(tuple(lang_select)), - hash(tuple(modality_select)), - ) - ), - ) - def update_task_list( - benchmark_name, type_select, domain_select, lang_select, modality_select - ): - start_time = time.time() - tasks_to_keep = [] - for task in mteb.get_benchmark(benchmark_name).tasks: - if task.metadata.type not in type_select: - continue - if not (set(task.metadata.domains or []) & set(domain_select)): - continue - if not (set(task.languages or []) & set(lang_select)): - continue - if not (set(task.metadata.modalities or []) & set(modality_select)): - continue - tasks_to_keep.append(task.metadata.name) - elapsed = time.time() - start_time - logger.info(f"update_task_list callback: {elapsed}s") - return sorted(tasks_to_keep) - - type_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) - domain_select.input( - update_task_list, - inputs=[ - benchmark_select, + @cachetools.cached( + cache={}, + key=lambda benchmark_name, type_select, domain_select, lang_select, - modality_select, - ], - outputs=[task_select], - ) - lang_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) - modality_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) + modality_select: hash( + ( + hash(benchmark_name), + hash(tuple(type_select)), + hash(tuple(domain_select)), + hash(tuple(lang_select)), + hash(tuple(modality_select)), + ) + ), + ) + def update_task_list( + benchmark_name, type_select, domain_select, lang_select, modality_select + ): + start_time = time.time() + tasks_to_keep = [] + for task in mteb.get_benchmark(benchmark_name).tasks: + if task.metadata.type not in type_select: + continue + if not (set(task.metadata.domains or 
[]) & set(domain_select)): + continue + if not (set(task.languages or []) & set(lang_select)): + continue + if not (set(task.metadata.modalities or []) & set(modality_select)): + continue + tasks_to_keep.append(task.metadata.name) + elapsed = time.time() - start_time + logger.info(f"update_task_list callback: {elapsed}s") + return sorted(tasks_to_keep) + + type_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + domain_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + lang_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + modality_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) - @cachetools.cached( - cache={}, - key=lambda scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot: hash( - ( - id(scores), - hash(tuple(tasks)), - hash(availability), - hash(tuple(compatibility)), - hash(instructions), - hash(model_size), - hash(zero_shot), - ) - ), - ) - def update_models( - scores: list[dict], - tasks: list[str], - availability: bool | None, - compatibility: list[str], - instructions: bool | None, - model_size: tuple[int, int], - zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"], - ): - start_time = time.time() - model_names = list({entry["model_name"] for entry in scores}) - filtered_models = filter_models( - model_names, + @cachetools.cached( + cache={}, + key=lambda scores, tasks, availability, compatibility, instructions, model_size, - zero_shot_setting=zero_shot, + zero_shot: hash( + ( + id(scores), + hash(tuple(tasks)), + hash(availability), + 
hash(tuple(compatibility)), + hash(instructions), + hash(model_size), + hash(zero_shot), + ) + ), ) - elapsed = time.time() - start_time - if model_names == filtered_models: - # This indicates that the models should not be filtered - return None - logger.info(f"update_models callback: {elapsed}s") - return sorted(filtered_models) - - scores.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - task_select.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - availability.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - compatibility.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - instructions.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - model_size.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - zero_shot.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - - @cachetools.cached( - cache={}, - key=lambda scores, search_query, tasks, models_to_keep, benchmark_name: hash( - ( - id(scores), - hash(search_query), - hash(tuple(tasks)), - id(models_to_keep), - hash(benchmark_name), + def update_models( + scores: list[dict], + tasks: list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + 
zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"], + ): + start_time = time.time() + model_names = list({entry["model_name"] for entry in scores}) + filtered_models = filter_models( + model_names, + tasks, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting=zero_shot, ) - ), - ) - def update_tables( - scores, - search_query: str, - tasks, - models_to_keep, - benchmark_name: str, - ): - start_time = time.time() - tasks = set(tasks) - benchmark = mteb.get_benchmark(benchmark_name) - benchmark_tasks = {task.metadata.name for task in benchmark.tasks} - if (benchmark_tasks != tasks) or (models_to_keep is not None): - filtered_scores = [] - for entry in scores: - if entry["task_name"] not in tasks: - continue - if (models_to_keep is not None) and ( - entry["model_name"] not in models_to_keep - ): - continue - filtered_scores.append(entry) - else: - filtered_scores = scores - summary, per_task = scores_to_tables(filtered_scores, search_query) - elapsed = time.time() - start_time - logger.info(f"update_tables callback: {elapsed}s") - return summary, per_task - - task_select.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - scores.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - models.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - searchbar.submit( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) + elapsed = time.time() - start_time + if model_names == filtered_models: + # This indicates that the models should not be filtered + return None + logger.info(f"update_models callback: {elapsed}s") + return sorted(filtered_models) + + scores.change( + update_models, + 
inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + task_select.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + availability.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + compatibility.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + instructions.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + model_size.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + zero_shot.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) - gr.Markdown(acknowledgment_md, elem_id="ack_markdown") - - -# Prerun on all benchmarks, so that results of callbacks get cached -for benchmark in benchmarks: - ( - bench_languages, - bench_domains, - bench_types, - bench_modalities, - bench_tasks, - bench_scores, - ) = on_benchmark_select(benchmark.name) - filtered_models = update_models( - bench_scores, - bench_tasks, - availability=None, - compatibility=[], - instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - zero_shot="allow_all", - ) - # We have to call this both on the filtered and unfiltered task because the callbacks - # also gets called twice for some reason - update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) - filtered_tasks = update_task_list( - benchmark.name, 
bench_types, bench_domains, bench_languages, bench_modalities - ) - update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + @cachetools.cached( + cache={}, + key=lambda scores, + search_query, + tasks, + models_to_keep, + benchmark_name: hash( + ( + id(scores), + hash(search_query), + hash(tuple(tasks)), + id(models_to_keep), + hash(benchmark_name), + ) + ), + ) + def update_tables( + scores, + search_query: str, + tasks, + models_to_keep, + benchmark_name: str, + ): + start_time = time.time() + tasks = set(tasks) + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_tasks = {task.metadata.name for task in benchmark.tasks} + if (benchmark_tasks != tasks) or (models_to_keep is not None): + filtered_scores = [] + for entry in scores: + if entry["task_name"] not in tasks: + continue + if (models_to_keep is not None) and ( + entry["model_name"] not in models_to_keep + ): + continue + filtered_scores.append(entry) + else: + filtered_scores = scores + summary, per_task = create_tables(filtered_scores, search_query) + elapsed = time.time() - start_time + logger.info(f"update_tables callback: {elapsed}s") + return summary, per_task + + task_select.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + scores.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + models.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + searchbar.submit( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + + gr.Markdown(acknowledgment_md, elem_id="ack_markdown") + + # Prerun on all benchmarks, so that results of callbacks get cached + for benchmark in benchmarks: + ( + bench_languages, + bench_domains, + 
bench_types, + bench_modalities, + bench_tasks, + bench_scores, + ) = on_benchmark_select(benchmark.name) + filtered_models = update_models( + bench_scores, + bench_tasks, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot="allow_all", + ) + # We have to call this both on the filtered and unfiltered task because the callbacks + # also gets called twice for some reason + update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) + filtered_tasks = update_task_list( + benchmark.name, + bench_types, + bench_domains, + bench_languages, + bench_modalities, + ) + update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + return demo if __name__ == "__main__": - demo.launch(share=True) + logging.getLogger("mteb.load_results.task_results").setLevel( + logging.ERROR + ) # Warnings related to task split + logging.getLogger("mteb.model_meta").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + logging.getLogger("mteb.load_results.benchmark_results").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*") + + app = get_leaderboard_app() + app.launch(server_name="0.0.0.0", server_port=7860) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 74c96ea9f2..b848406ba5 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -5,8 +5,10 @@ from collections import defaultdict import gradio as gr +import matplotlib.pyplot as plt import numpy as np import pandas as pd +from matplotlib.colors import LinearSegmentedColormap from pandas.api.types import is_numeric_dtype from mteb.models.overview import get_model_meta @@ -98,9 +100,18 @@ def format_zero_shot(zero_shot_percentage: int): return f"{zero_shot_percentage:.0f}%" -def scores_to_tables( - scores_long: list[dict], search_query: str | 
None = None -) -> tuple[gr.DataFrame, gr.DataFrame]: +def create_light_green_cmap(): + cmap = plt.cm.get_cmap("Greens") + num_colors = 256 + half_colors = np.linspace(0, 0.5, num_colors) + half_cmap = [cmap(val) for val in half_colors] + light_green_cmap = LinearSegmentedColormap.from_list( + "LightGreens", half_cmap, N=256 + ) + return light_green_cmap + + +def scores_to_tables(scores_long: list[dict], search_query: str | None = None): if not scores_long: no_results_frame = pd.DataFrame( {"No results": ["You can try relaxing your criteria"]} @@ -157,6 +168,13 @@ def scores_to_tables( "Number of Parameters", model_metas.map(lambda m: format_n_parameters(m.n_parameters)), ) + joint_table.insert( + 1, + "Memory Usage (MB)", + model_metas.map( + lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" + ), + ) tasks = get_tasks(tasks=list(data["task_name"].unique())) joint_table.insert( 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) @@ -196,32 +214,78 @@ def scores_to_tables( # setting model name column to markdown column_types[1] = "markdown" score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] - numeric_zero_shot = joint_table["Zero-shot"].copy().replace(-1, np.nan) + + return joint_table, per_task, score_columns, column_types + + +def apply_styling( + joint_table: pd.DataFrame, + per_task: pd.DataFrame, + score_columns: list[str], + column_types: list[str], +) -> tuple[gr.DataFrame, gr.DataFrame]: + excluded_columns = [ + "Rank (Borda)", + "Model", + "Number of Parameters", + "Embedding Dimensions", + "Max Tokens", + "Memory Usage (MB)", + ] + gradient_columns = [ + col for col in joint_table.columns if col not in excluded_columns + ] + light_green_cmap = create_light_green_cmap() + numeric_data = joint_table.copy() + numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan) joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = 
joint_table[score_columns].map(format_scores) - joint_table_style = ( - joint_table.style.format( - { - **dict.fromkeys(score_columns, "{:.2f}"), - "Rank (Borda)": "{:.0f}", - }, - na_rep="", - ) - .highlight_min("Rank (Borda)", props="font-weight: bold") - .highlight_max(subset=score_columns, props="font-weight: bold") - .background_gradient( - cmap="RdYlGn", - subset=["Zero-shot"], - vmin=50, - vmax=100, - gmap=numeric_zero_shot, - ) + joint_table_style = joint_table.style.format( + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="", ) + joint_table_style = joint_table_style.highlight_min( + "Rank (Borda)", props="font-weight: bold" + ).highlight_max(subset=score_columns, props="font-weight: bold") + + # Apply background gradients for each selected column + for col in gradient_columns: + if col in joint_table.columns: + mask = numeric_data[col].notna() + if col != "Zero-shot": + gmap_values = numeric_data[col] * 100 + cmap = light_green_cmap + joint_table_style = joint_table_style.background_gradient( + cmap=cmap, + subset=pd.IndexSlice[mask, col], + gmap=gmap_values.loc[mask], + ) + else: + gmap_values = numeric_data[col] + cmap = "RdYlGn" + joint_table_style = joint_table_style.background_gradient( + cmap=cmap, + subset=pd.IndexSlice[mask, col], + vmin=50, + vmax=100, + gmap=gmap_values.loc[mask], + ) task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") + for col in task_score_columns: + if col != "Model": + mask = per_task[col].notna() + per_task_style = per_task_style.background_gradient( + cmap=light_green_cmap, + subset=pd.IndexSlice[mask, col], + gmap=per_task[col].loc[mask], + ) return ( gr.DataFrame( joint_table_style, @@ -231,3 +295,18 @@ def scores_to_tables( ), gr.DataFrame(per_task_style, 
interactive=False, pinned_columns=1), ) + + +def create_tables( + scores_long: list[dict], search_query: str | None = None +) -> tuple[gr.DataFrame, gr.DataFrame]: + result = scores_to_tables(scores_long, search_query) + # dataframe with No Results is returned, so no need to apply styling + if len(result) == 2: + joint_table, per_task = result + return joint_table, per_task + joint_table, per_task, score_columns, column_types = result + summary_table, per_task_table = apply_styling( + joint_table, per_task, score_columns, column_types + ) + return summary_table, per_task_table diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 5be5ec11e5..44cee1da57 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -378,17 +378,17 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: for split, split_score in scores.items(): for hf_subset, hf_subset_scores in split_score.items(): for name, prev_name in [ - (ScoringFunction.COSINE, "cos_sim"), - (ScoringFunction.MANHATTAN, "manhattan"), - (ScoringFunction.EUCLIDEAN, "euclidean"), - (ScoringFunction.DOT_PRODUCT, "dot"), + (ScoringFunction.COSINE.value, "cos_sim"), + (ScoringFunction.MANHATTAN.value, "manhattan"), + (ScoringFunction.EUCLIDEAN.value, "euclidean"), + (ScoringFunction.DOT_PRODUCT.value, "dot"), ("max", "max"), - (ScoringFunction.MODEL_SPECIFIC, "similarity"), + (ScoringFunction.MODEL_SPECIFIC.value, "similarity"), ]: prev_name_scores = hf_subset_scores.pop(prev_name, None) if prev_name_scores is not None: for k, v in prev_name_scores.items(): - hf_subset_scores[f"{name.value}_{k}"] = v + hf_subset_scores[f"{name}_{k}"] = v if "main_score" not in hf_subset_scores: if main_score in hf_subset_scores: diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index b4ecbc0d69..02e81dce52 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -11,17 +11,17 @@ from mteb.encoder_interface import 
BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package def blip2_loader(**kwargs): - try: # a temporal fix for the dependency issues. - from lavis.models.blip2_models.blip2_image_text_matching import ( - Blip2ITM, - ) - except ImportError: - raise ImportError( - "Please install `pip install mteb[blip2]` to use BLIP-2 models." - ) + model_name = kwargs.get("model_name", "BLIP-2") + requires_package( + blip2_loader, "salesforce-lavis", model_name, "pip install 'mteb[blip2]'" + ) + from lavis.models.blip2_models.blip2_image_text_matching import ( + Blip2ITM, + ) class BLIP2ModelWrapper(Wrapper): def __init__( diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index 6cb6133d4e..e31c605690 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -5,18 +5,15 @@ from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) -def bm25_loader(name: str, **kwargs): - try: - import bm25s - import Stemmer - except ImportError: - raise ImportError( - "bm25s or PyStemmer is not installed. Please install it with `pip install mteb[bm25s]`." 
- ) +def bm25_loader(model_name, **kwargs): + requires_package(bm25_loader, "bm25s", model_name, "pip install mteb[bm25s]") + import bm25s + import Stemmer class BM25Search(Wrapper): """BM25 search""" diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py index 61abccb9da..4fde7c4f49 100644 --- a/mteb/models/cache_wrapper.py +++ b/mteb/models/cache_wrapper.py @@ -12,9 +12,6 @@ from mteb.encoder_interface import Encoder from mteb.models.wrapper import Wrapper -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) logger = logging.getLogger(__name__) diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index 023d2d59dc..0b5d0172bd 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -13,14 +13,15 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import requires_image_dependencies, requires_package def cohere_v_loader(**kwargs): - try: - import cohere # type: ignore - except ImportError: - raise ImportError("To use cohere models, please run `pip install cohere`.") + model_name = kwargs.get("model_name", "Cohere") + requires_package( + cohere_v_loader, "cohere", model_name, "pip install 'mteb[cohere]'" + ) + import cohere class CohereMultiModalModelWrapper: def __init__( diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 22955a3dde..15085d8253 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -10,6 +10,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -33,12 +34,8 @@ def __init__( and finally to the specific prompt type. 
**kwargs: Additional arguments to pass to the model. """ - try: - from pylate import models as colbert_model - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "To use the ColBERT models `pylate` is required. Please install it with `pip install mteb[pylate]`." - ) from e + requires_package(self, "pylate", model_name, "pip install mteb[pylate]") + from pylate import models as colbert_model self.model_name = model_name self.model = colbert_model.ColBERT(self.model_name, revision=revision, **kwargs) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 9b8b4bfa6a..0f586a9b3a 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -43,7 +43,7 @@ open_weights=True, revision="baa7be480a7de1539afce709c8f13f833a510e0a", release_date=E5_PAPER_RELEASE_DATE, - framework=["GritLM", "PyTorch"], + framework=["GritLM", "PyTorch", "Sentence Transformers"], similarity_fn_name=ScoringFunction.COSINE, use_instructions=True, reference="https://huggingface.co/intfloat/multilingual-e5-large-instruct", @@ -81,7 +81,7 @@ open_weights=True, revision="07163b72af1488142a360786df853f237b1a3ca1", release_date=E5_PAPER_RELEASE_DATE, - framework=["GritLM", "PyTorch"], + framework=["GritLM", "PyTorch", "Sentence Transformers"], similarity_fn_name=ScoringFunction.COSINE, use_instructions=True, reference="https://huggingface.co/intfloat/e5-mistral-7b-instruct", @@ -135,7 +135,7 @@ open_weights=True, public_training_data=None, public_training_code=None, - framework=["PyTorch"], + framework=["PyTorch", "Sentence Transformers", "GritLM"], reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", similarity_fn_name=ScoringFunction.COSINE, use_instructions=True, diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py index 36430478fc..adc6f09d74 100644 --- a/mteb/models/gme_v_models.py +++ b/mteb/models/gme_v_models.py @@ -15,12 +15,8 @@ from mteb.model_meta import ModelMeta, ScoringFunction from mteb.models.wrapper 
import Wrapper -logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) -HF_GME_QWEN2VL_2B = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct" -HF_GME_QWEN2VL_7B = "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct" - class Encoder(torch.nn.Module): def __init__( @@ -132,7 +128,7 @@ def embed( class GmeQwen2VL(Wrapper): def __init__( self, - model_name: str = HF_GME_QWEN2VL_2B, + model_name: str, model_path: str | None = None, device: str = "cuda" if torch.cuda.is_available() else "cpu", min_image_tokens=4, @@ -340,7 +336,7 @@ def fetch_image( gme_qwen2vl_2b = ModelMeta( loader=GmeQwen2VL, - name=HF_GME_QWEN2VL_2B, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", languages=["eng_Latn", "cmn-Hans"], open_weights=True, revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a", @@ -351,7 +347,7 @@ def fetch_image( embed_dim=1536, license="apache-2.0", max_tokens=32768, - reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B, + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", similarity_fn_name=ScoringFunction.COSINE, framework=["PyTorch"], use_instructions=True, @@ -362,7 +358,7 @@ def fetch_image( gme_qwen2vl_7b = ModelMeta( loader=GmeQwen2VL, - name=HF_GME_QWEN2VL_7B, + name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", languages=["eng_Latn", "cmn-Hans"], open_weights=True, revision="477027a6480f8630363be77751f169cc3434b673", @@ -373,7 +369,7 @@ def fetch_image( embed_dim=3584, license="apache-2.0", max_tokens=32768, - reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B, + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", similarity_fn_name=ScoringFunction.COSINE, framework=["PyTorch"], use_instructions=True, diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 940bbf7744..3e182fd644 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -9,6 +9,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction from 
mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package MULTILINGUAL_EVALUATED_LANGUAGES = [ "arb_Arab", @@ -74,12 +75,11 @@ def _embed( """Embeds texts with a pre-trained, foundational model. From https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#generative-ai-get-text-embedding-python_vertex_ai_sdk """ - try: - from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel - except ImportError: - raise ImportError( - "The `vertexai` package is required to run the google API, please install it using `pip install vertexai`" - ) + requires_package( + self, "vertexai", self.model_name, "pip install 'mteb[vertexai]'" + ) + from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel + model = TextEmbeddingModel.from_pretrained(self.model_name) if titles: # Allow title-only embeddings by replacing text with a space diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 02440e08b9..597a35a441 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -10,6 +10,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -20,12 +21,10 @@ def instruct_wrapper( instruction_template: str | Callable[[str], str] | None = None, **kwargs, ): - try: - from gritlm import GritLM - except ImportError: - raise ImportError( - f"Please install `pip install mteb[gritlm]` to use {model_name_or_path}." 
- ) + requires_package( + instruct_wrapper, "gritlm", model_name_or_path, "pip install 'mteb[gritlm]'" + ) + from gritlm import GritLM class InstructWrapper(GritLM, Wrapper): def __init__( diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index f8fade4380..0b13790e34 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -139,19 +140,14 @@ def __init__( raise RuntimeError( f"sentence_transformers version {st_version} is lower than the required version 3.1.0" ) - try: - import einops # noqa: F401 - except ImportError: - raise ImportError( - "To use the jina-embeddings-v3 models `einops` is required. Please install it with `pip install mteb[jina]`." - ) - try: - import flash_attn # noqa: F401 - except ImportError: - logger.warning( - "Using flash_attn for jina-embeddings-v3 models is recommended. Please install it with `pip install mteb[flash_attention]`." - "Fallback to native implementation." 
- ) + requires_package(self, "jina", model, "pip install 'mteb[jina]'") + import einops # noqa: F401 + + requires_package( + self, "flash_attention", model, "pip install 'mteb[flash_attention]'" + ) + import flash_attn # noqa: F401 + super().__init__(model, revision, model_prompts, **kwargs) def encode( diff --git a/mteb/models/llm2clip_models.py b/mteb/models/llm2clip_models.py index 2a83f3ff85..b1676d200a 100644 --- a/mteb/models/llm2clip_models.py +++ b/mteb/models/llm2clip_models.py @@ -12,7 +12,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import requires_image_dependencies, requires_package MODEL2PROCESSOR = { "microsoft/LLM2CLIP-Openai-L-14-336": "openai/clip-vit-large-patch14-336", @@ -22,13 +22,11 @@ def llm2clip_loader(**kwargs): - try: - from llm2vec import LLM2Vec - except ImportError: - # https://github.com/baaivision/EVA/tree/master/EVA-CLIP#setup - raise ImportError( - "To use the LLM2CLIP models `llm2vec` is required. Please install it with `pip install llm2vec`." 
- ) + model_name = kwargs.get("model_name", "LLM2CLIP") + requires_package( + llm2clip_loader, "llm2vec", model_name, "pip install 'mteb[llm2vec]'" + ) + from llm2vec import LLM2Vec class LLM2CLIPWrapper(Wrapper): def __init__( diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index 70ed0e7fea..d826d8911a 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -10,6 +10,7 @@ from mteb.encoder_interface import BatchedInput, Encoder, PromptType from mteb.model_meta import ModelMeta, ScoringFunction from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package, suggest_package logger = logging.getLogger(__name__) @@ -57,21 +58,21 @@ def __init__( *args, **kwargs, ): - try: - from llm2vec import LLM2Vec - except ImportError: - raise ImportError( - "To use the LLM2Vec models `llm2vec` is required. Please install it with `pip install llm2vec`." - ) + model_name = kwargs.get("model_name", "LLM2Vec") + requires_package(self, "llm2vec", model_name, "pip install 'mteb[llm2vec]'") + from llm2vec import LLM2Vec + extra_kwargs = {} - try: - import flash_attn # noqa + if suggest_package( + self, + "flash_attn", + model_name, + "pip install flash-attn --no-build-isolation", + ): + import flash_attn # noqa: F401 extra_kwargs["attn_implementation"] = "flash_attention_2" - except ImportError: - logger.warning( - "LLM2Vec models were trained with flash attention enabled. For optimal performance, please install the `flash_attn` package with `pip install flash-attn --no-build-isolation`." 
- ) + self.model_prompts = ( self.validate_task_to_prompt_name(model_prompts) if model_prompts else None ) diff --git a/mteb/models/moco_models.py b/mteb/models/moco_models.py index 3036f76b5c..c62b8232f3 100644 --- a/mteb/models/moco_models.py +++ b/mteb/models/moco_models.py @@ -10,14 +10,13 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import requires_image_dependencies, requires_package def mocov3_loader(**kwargs): - try: - import timm - except ImportError: - raise ImportError("Please install `pip install timm` to use MOCOv3 models.") + model_name = kwargs.get("model_name", "MOCOv3") + requires_package(mocov3_loader, "timm", model_name, "pip install 'mteb[timm]'") + import timm class MOCOv3Wrapper(Wrapper): """A wrapper class for MOCOv3 models that supports image encoding. diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index f597406048..820a41d94d 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -10,6 +10,7 @@ from mteb.model_meta import ModelMeta, ScoringFunction from mteb.models.bge_models import bge_training_data from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -26,12 +27,8 @@ def __init__( model_name: The Model2Vec model to load from HuggingFace Hub. **kwargs: Additional arguments to pass to the wrapper. """ - try: - from model2vec import StaticModel # type: ignore - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "To use the Model2Vec models `model2vec` is required. Please install it with `pip install mteb[model2vec]`." 
- ) from e + requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'") + from model2vec import StaticModel # type: ignore self.model_name = model_name self.model = StaticModel.from_pretrained(self.model_name) @@ -215,3 +212,118 @@ def encode( public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, ) + +pubmed_bert_100k = ModelMeta( + loader=Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-100K", + languages=["eng_Latn"], + open_weights=True, + revision="bac5e3b12fb8c650e92a19c41b436732c4f16e9e", + release_date="2025-01-03", + n_parameters=1 * 1e5, + memory_usage_mb=0, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets={}, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_500k = ModelMeta( + loader=Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-500K", + languages=["eng_Latn"], + open_weights=True, + revision="34ba71e35c393fdad7ed695113f653feb407b16b", + release_date="2025-01-03", + n_parameters=5 * 1e5, + memory_usage_mb=2, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets={}, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_1m = ModelMeta( + loader=Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-1M", + 
languages=["eng_Latn"], + open_weights=True, + revision="2b7fed222594708da6d88bcda92ae9b434b7ddd1", + release_date="2025-01-03", + n_parameters=1 * 1e6, + memory_usage_mb=2, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets={}, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_2m = ModelMeta( + loader=Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-2M", + languages=["eng_Latn"], + open_weights=True, + revision="1d7bbe04d6713e425161146bfdc71473cbed498a", + release_date="2025-01-03", + n_parameters=1.95 * 1e6, + memory_usage_mb=7, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets={}, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_8m = ModelMeta( + loader=Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-8M", + languages=["eng_Latn"], + open_weights=True, + revision="387d350015e963744f4fafe56a574b7cd48646c9", + release_date="2025-01-03", + n_parameters=7.81 * 1e6, + memory_usage_mb=30, + max_tokens=np.inf, + embed_dim=256, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + 
training_datasets={}, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) diff --git a/mteb/models/nb_sbert.py b/mteb/models/nb_sbert.py new file mode 100644 index 0000000000..36bb247fb4 --- /dev/null +++ b/mteb/models/nb_sbert.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from mteb.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import ( + SentenceTransformerWrapper, +) + +nb_sbert = ModelMeta( + loader=SentenceTransformerWrapper, + name="NbAiLab/nb-sbert-base", + languages=["nno-Latn", "nob-Latn", "swe-Latn", "dan-Latn"], + open_weights=True, + revision="b95656350a076aeafd2d23763660f80655408cc6", + release_date="2022-11-23", + n_parameters=1_780_000_000, + memory_usage_mb=197, + embed_dim=4096, + license="apache-2.0", + max_tokens=75, + reference="https://huggingface.co/NbAiLab/nb-sbert-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian", + training_datasets={}, +) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index aeccb8692f..50304bda18 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -30,16 +30,16 @@ def __init__( requires_package( self, "openai", - "Openai text embedding", - install_instruction="pip install mteb[openai]", + model_name, + install_instruction="pip install 'mteb[openai]'", ) from openai import OpenAI requires_package( self, "tiktoken", - "Tiktoken package", - install_instruction="pip install mteb[openai]", + model_name, + install_instruction="pip install 'mteb[openai]'", ) import tiktoken diff --git a/mteb/models/openclip_models.py b/mteb/models/openclip_models.py index c2ee5bb2e6..8a6dbea4a9 100644 --- a/mteb/models/openclip_models.py +++ 
b/mteb/models/openclip_models.py @@ -10,14 +10,18 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import requires_image_dependencies, requires_package def openclip_loader(**kwargs): - try: - import open_clip - except ImportError: - raise ImportError("Please run `pip install open_clip_torch`.") + model_name = kwargs.get("model_name", "CLIP-ViT") + requires_package( + openclip_loader, + "open_clip_torch", + model_name, + "pip install 'mteb[open_clip_torch]'", + ) + import open_clip class OpenCLIPWrapper(Wrapper): def __init__( diff --git a/mteb/models/ops_moa_models.py b/mteb/models/ops_moa_models.py new file mode 100644 index 0000000000..d9bd9c1640 --- /dev/null +++ b/mteb/models/ops_moa_models.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import numpy as np +from sentence_transformers import SentenceTransformer + +from mteb.model_meta import ModelMeta +from mteb.models.wrapper import Wrapper + + +class OPSWrapper(Wrapper): + def __init__(self, model_name, revision): + super().__init__() + self.model = SentenceTransformer( + model_name, revision=revision, trust_remote_code=True + ) + self.output_dim = 1536 + + def encode(self, sentences: list[str], **kwargs) -> np.ndarray: + embeddings = self.model.encode(sentences, **kwargs) + return embeddings[:, : self.output_dim] + + +ops_moa_conan_embedding = ModelMeta( + name="OpenSearch-AI/Ops-MoA-Conan-embedding-v1", + revision="46dcd58753f3daa920c66f89e47086a534089350", + release_date="2025-03-26", + languages=["zho_Hans"], + loader=OPSWrapper, + n_parameters=343 * 1e6, + memory_usage_mb=2e3, + max_tokens=512, + embed_dim=1536, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + 
reference="https://huggingface.co/OpenSearch-AI/Ops-MoA-Conan-embedding-v1", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets={ + "T2Retrieval": ["train"], + "MMarcoRetrieval": ["train"], + "DuRetrieval": ["train"], + "CovidRetrieval": ["train"], + "CmedqaRetrieval": ["train"], + "EcomRetrieval": ["train"], + "MedicalRetrieval": ["train"], + "VideoRetrieval": ["train"], + }, + superseded_by=None, +) + +ops_moa_yuan_embedding = ModelMeta( + name="OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0", + revision="23712d0766417b0eb88a2513c6e212a58b543268", + release_date="2025-03-26", + languages=["zho_Hans"], + loader=OPSWrapper, + n_parameters=343 * 1e6, + memory_usage_mb=2e3, + max_tokens=512, + embed_dim=1536, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets={ + "T2Retrieval": ["train"], + "MMarcoRetrieval": ["train"], + "DuRetrieval": ["train"], + "CovidRetrieval": ["train"], + "CmedqaRetrieval": ["train"], + "EcomRetrieval": ["train"], + "MedicalRetrieval": ["train"], + "VideoRetrieval": ["train"], + }, + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index b62a3cfb07..b06c925509 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -51,12 +51,14 @@ model2vec_models, moka_models, mxbai_models, + nb_sbert, no_instruct_sentence_models, nomic_models, nomic_models_vision, nvidia_models, openai_models, openclip_models, + ops_moa_models, piccolo_models, promptriever_models, qodo_models, @@ -64,8 +66,10 @@ repllama_models, rerankers_custom, rerankers_monot5_based, + richinfoai_models, ru_sentence_models, salesforce_models, + searchmap_models, sentence_transformers_models, siglip_models, sonar_models, @@ -124,6 +128,7 @@ nvidia_models, 
openai_models, openclip_models, + ops_moa_models, piccolo_models, gme_v_models, promptriever_models, @@ -132,8 +137,10 @@ repllama_models, rerankers_custom, rerankers_monot5_based, + richinfoai_models, ru_sentence_models, salesforce_models, + searchmap_models, sentence_transformers_models, siglip_models, vista_models, @@ -150,6 +157,7 @@ fa_models, ara_models, b1ade_models, + nb_sbert, ] MODEL_REGISTRY = {} diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index cab2b235b3..68917f0cc4 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -16,6 +16,7 @@ ScoringFunction, ) from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -31,12 +32,10 @@ def __init__( model_prompts: dict[str, str] | None = None, **kwargs, ): - try: - from peft import PeftModel - except ImportError: - raise ImportError( - "To use the RepLLaMA based models `peft` is required. Please install it with `pip install 'mteb[peft]'`." - ) + requires_package( + self, "peft", peft_model_name_or_path, "pip install 'mteb[peft]'" + ) + from peft import PeftModel self.base_model = AutoModel.from_pretrained( base_model_name_or_path, diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 6787d97f16..a6bb7d0f03 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from mteb.model_meta import ModelMeta from mteb.models.bge_models import bge_m3_training_data +from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -60,12 +61,13 @@ def __init__( if self.fp_options: model_args["torch_dtype"] = self.fp_options - try: - from FlagEmbedding import FlagReranker - except ImportError: - raise ImportError( - "FlagEmbedding is not installed. 
Please install it via `pip install mteb[flagembedding]`" - ) + requires_package( + self, + "flagembedding", + model_name_or_path, + "pip install 'mteb[flagembedding]'", + ) + from FlagEmbedding import FlagReranker self.model = FlagReranker(model_name_or_path, use_fp16=True) diff --git a/mteb/models/richinfoai_models.py b/mteb/models/richinfoai_models.py new file mode 100644 index 0000000000..4644ec2f5b --- /dev/null +++ b/mteb/models/richinfoai_models.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_full_data, bge_m3_training_data +from mteb.models.sentence_transformer_wrapper import ( + SentenceTransformerWrapper, +) +from mteb.models.stella_models import stella_zh_datasets + +ritrieve_zh_v1 = ModelMeta( + loader=SentenceTransformerWrapper, + name="richinfoai/ritrieve_zh_v1", + languages=["zho_Hans"], + open_weights=True, + revision="f8d5a707656c55705027678e311f9202c8ced12c", + release_date="2025-03-25", + n_parameters=int(326 * 1e6), + memory_usage_mb=1242, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/richinfoai/ritrieve_zh_v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets={**stella_zh_datasets, **bge_full_data, **bge_m3_training_data}, +) diff --git a/mteb/models/searchmap_models.py b/mteb/models/searchmap_models.py new file mode 100644 index 0000000000..f90280a200 --- /dev/null +++ b/mteb/models/searchmap_models.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import ( + SentenceTransformerWrapper, +) + +# Define task instructions with specific task names +task_instructions = { + "Classification": "Generate a representation for this text that can be used for 
classification:", + "Clustering": "Generate a representation for this text that can be used for clustering:", + "Retrieval": "Generate a representation for this text that can be used for retrieval:", + "STS": "Generate a representation for this text that can be used for semantic similarity:", + "PairClassification": "Generate a representation for this text pair that can be used for classification:", + "Reranking": "Generate a representation for this text that can be used for reranking:", + "Summarization": "Generate a representation for this text that can be used for summarization:", +} + +searchmap_preview = ModelMeta( + loader=SentenceTransformerWrapper, + loader_kwargs={ + "model_prompts": task_instructions, + }, + name="VPLabs/SearchMap_Preview", + revision="69de17ef48278ed08ba1a4e65ead8179912b696e", + languages=["eng_Latn"], + open_weights=True, + use_instructions=True, + release_date="2025-03-05", + n_parameters=435_000_000, + memory_usage_mb=1660, + embed_dim=4096, + license="mit", + max_tokens=8192, + reference="https://huggingface.co/VPLabs/SearchMap_Preview", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + public_training_code=None, + public_training_data=None, + training_datasets=None, + adapted_from="NovaSearch/stella_en_400M_v5", +) diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index 7a0070edf0..59d43b2e22 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -11,9 +11,12 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import ( + requires_image_dependencies, + requires_package, + suggest_package, +) -logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) EncodeTypes = Literal["query", "passage"] @@ -29,13 +32,16 @@ def __init__( **kwargs, ): requires_image_dependencies() - try: + if 
suggest_package( + self, + "flash_attn", + model_name, + "pip install flash-attn --no-build-isolation", + ): import flash_attn # noqa - from peft import LoraConfig, PeftModel # noqa - except ImportError: - logger.warning( - "VLM2Vec models were trained with flash attention enabled. For optimal performance, please install the `flash_attn` package with `pip install flash-attn --no-build-isolation`." - ) + + requires_package(self, "peft", model_name, "pip install 'mteb[peft]'") + from peft import LoraConfig, PeftModel # noqa self.pooling = "last" self.normalize = True diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 77491eb9d1..4a3fca40ba 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -79,7 +79,7 @@ def __init__( model_prompts: dict[str, str] | None = None, **kwargs, ) -> None: - requires_package(self, "voyageai", "Voyage") + requires_package(self, "voyageai", model_name, "pip install 'mteb[voyageai]'") import voyageai self._client = voyageai.Client(max_retries=max_retries) diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py index c270d5d363..e7169d24f8 100644 --- a/mteb/models/voyage_v.py +++ b/mteb/models/voyage_v.py @@ -11,7 +11,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction -from mteb.requires_package import requires_image_dependencies +from mteb.requires_package import requires_image_dependencies, requires_package def downsample_image( @@ -46,16 +46,15 @@ def downsample_image( def voyage_v_loader(**kwargs): - try: - import voyageai - except ImportError: - raise ImportError("To use voyage models, please run `pip install -U voyageai`.") - try: - from tenacity import retry, stop_after_attempt, wait_exponential - except ImportError: - raise ImportError( - "please run `pip install tenacity` to use exponential backoff." 
- ) + model_name = kwargs.get("model_name", "Voyage vision") + requires_package( + voyage_v_loader, + "voyageai and tenacity", + model_name, + "pip install 'mteb[voyage_v]'", + ) + import voyageai + from tenacity import retry, stop_after_attempt, wait_exponential class VoyageMultiModalModelWrapper: def __init__( diff --git a/mteb/requires_package.py b/mteb/requires_package.py index d261acdffb..22b6ddebf7 100644 --- a/mteb/requires_package.py +++ b/mteb/requires_package.py @@ -1,6 +1,9 @@ from __future__ import annotations import importlib.util +import logging + +logger = logging.getLogger(__name__) def _is_package_available(pkg_name: str) -> bool: @@ -24,6 +27,22 @@ def requires_package( ) +def suggest_package( + obj, package_name: str, model_name: str, install_instruction: str +) -> bool: + """Check if a package is available and log a warning with installation instructions if it's not. + Unlike requires_package, this doesn't raise an error but returns True if the package is available. + """ + if not _is_package_available(package_name): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + logger.warning( + f"{name} can benefit from the `{package_name}` library but it was not found in your environment. " + + f"{model_name} models were trained with flash attention enabled. For optimal performance, please install the `{package_name}` package with `{install_instruction}`." 
+ ) + return False + return True + + def requires_image_dependencies() -> None: if not _is_package_available("torchvision"): raise ImportError( diff --git a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py index def8cd0c91..6fb5e662ab 100644 --- a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py +++ b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py @@ -9,7 +9,7 @@ class KlueMrcDomainClustering(AbsTaskClustering): metadata = TaskMetadata( name="KlueMrcDomainClustering", - description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education ", + description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain", type="Clustering", category="t2c", @@ -36,6 +36,7 @@ class KlueMrcDomainClustering(AbsTaskClustering): archivePrefix={arXiv}, primaryClass={cs.CL}, }""", + prompt="Identify the topic or theme of the given texts", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py index 0d2d1ee9f7..af68013403 100644 --- a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py +++ b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py @@ -9,7 +9,7 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): metadata = TaskMetadata( name="KlueYnatMrcCategoryClustering", - description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate ", + description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. 
News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title", type="Clustering", category="t2t", @@ -36,6 +36,7 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): archivePrefix={arXiv}, primaryClass={cs.CL}, }""", + prompt="Identify the topic or theme of the given texts", ) def dataset_transform(self): diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py index 914bfc0725..56ec595f3f 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py @@ -3,23 +3,17 @@ from .eng import ( BLINKIT2IMultiChoice, BLINKIT2TMultiChoice, - ImageCoDeT2IMultiChoice, - ROxfordEasyI2IMultiChoice, - ROxfordHardI2IMultiChoice, - ROxfordMediumI2IMultiChoice, - RParisEasyI2IMultiChoice, - RParisHardI2IMultiChoice, - RParisMediumI2IMultiChoice, + CVBenchCount, + CVBenchDepth, + CVBenchDistance, + CVBenchRelation, ) __all__ = [ - "ImageCoDeT2IMultiChoice", + "CVBenchCount", + "CVBenchDepth", + "CVBenchDistance", + "CVBenchRelation", "BLINKIT2IMultiChoice", "BLINKIT2TMultiChoice", - "ROxfordEasyI2IMultiChoice", - "ROxfordHardI2IMultiChoice", - "ROxfordMediumI2IMultiChoice", - "RParisEasyI2IMultiChoice", - "RParisHardI2IMultiChoice", - "RParisMediumI2IMultiChoice", ] diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py index 2ecdd6fc05..538b26464e 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py @@ -14,7 +14,7 @@ class BLINKIT2IMultiChoice(AbsTaskAny2AnyMultiChoice): "revision": "a9f994925551c14503d00d86f1307bac6e2ead6a", "trust_remote_code": True, }, - type="VisionCentric", + type="VisionCentricQA", category="it2i", eval_splits=["test"], eval_langs=["eng-Latn"], diff --git 
a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py index 719a381876..ea101ff1d9 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py @@ -13,7 +13,7 @@ class BLINKIT2TMultiChoice(AbsTaskAny2AnyMultiChoice): "path": "JamieSJS/blink-it2t-multi", "revision": "bc8f4c7f62450a4ceb737c8339061cf87aea42d5", }, - type="VisionCentric", + type="VisionCentricQA", category="it2t", eval_splits=["test"], eval_langs=["eng-Latn"], diff --git a/mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py similarity index 59% rename from mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py rename to mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py index 847d583b58..14b328b30b 100644 --- a/mteb/tasks/Image/Any2TextMultipleChoice/eng/CVBench.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py @@ -1,20 +1,93 @@ from __future__ import annotations -import datasets +from datasets import Dataset, load_dataset -from mteb.abstasks.Image.AbsTaskAny2TextMultipleChoice import ( - AbsTaskAny2TextMultipleChoice, -) +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from mteb.abstasks.TaskMetadata import TaskMetadata +def _load_data( + path: str, + splits: list[str], + cache_dir: str = None, + revision: str = None, + subtask: str = "Count", +): + corpus = {} + queries = {} + relevant_docs = {} + + dataset = load_dataset( + path, + cache_dir=cache_dir, + revision=revision, + ) + dataset = dataset.filter(lambda example: example["task"] == subtask) + for split in splits: + split_dataset = dataset[split] + + split_dataset = split_dataset.map( + transform_choices, + remove_columns=[ + "idx", + "type", + "filename", + "source", + "source_dataset", + "source_filename", + "target_class", + "target_size", + "bbox", + "prompt", + ], + ) + + 
queries[split] = split_dataset.map( + lambda x, idx: { + "id": f"query-{split}-{idx}", + "text": x["question"], + "modality": "image,text", + }, + with_indices=True, + remove_columns=["answer", "choices", "task"], + ) + + corpus_element = [] + corpus_to_id = {} + relevant_docs[split] = {} + + for idx, entry in enumerate(split_dataset): + choices = entry["choices"] + answer = choices[entry["answer"]] + + query_id = f"query-{split}-{idx}" + + for choice in choices: + if choice not in corpus_to_id: + corpus_id = len(corpus_element) + corpus_element.append(choice) + corpus_to_id[choice] = f"corpus-{split}-{corpus_id}" + + is_relevant = 1 if choice == answer else 0 + if query_id not in relevant_docs[split]: + relevant_docs[split][query_id] = {} + relevant_docs[split][query_id][corpus_to_id[choice]] = is_relevant + corpus_ids = [corpus_id for _, corpus_id in corpus_to_id.items()] + docs = [doc for doc, _ in corpus_to_id.items()] + corpus_records = [] + for corpus_id, doc in zip(corpus_ids, docs): + corpus_records.append({"id": corpus_id, "text": doc, "modality": "text"}) + corpus[split] = Dataset.from_list(corpus_records) + return corpus, queries, relevant_docs + + def transform_choices(example): mapping = {"(A)": 0, "(B)": 1, "(C)": 2, "(D)": 3, "(E)": 4, "(F)": 5} example["answer"] = mapping[example["answer"]] return example -class CVBenchCount(AbsTaskAny2TextMultipleChoice): +class CVBenchCount(AbsTaskAny2AnyMultiChoice): metadata = TaskMetadata( name="CVBenchCount", description="count the number of objects in the image.", @@ -23,7 +96,7 @@ class CVBenchCount(AbsTaskAny2TextMultipleChoice): "path": "nyu-visionx/CV-Bench", "revision": "22409a927ab5cf68e3655023d51694587455fc99", }, - type="VisionCentric", + type="VisionCentricQA", category="it2t", eval_splits=["test"], eval_langs=["eng-Latn"], @@ -34,7 +107,7 @@ class CVBenchCount(AbsTaskAny2TextMultipleChoice): license="mit", annotations_creators="derived", dialect=[], - modalities=["text", "image"], + 
modalities=["image", "text"], sample_creation="found", bibtex_citation="""@article{tong2024cambrian, title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, @@ -45,28 +118,17 @@ class CVBenchCount(AbsTaskAny2TextMultipleChoice): ) def load_data(self, **kwargs): - self.dataset = datasets.load_dataset(**self.metadata.dataset) - self.dataset_transform() - self.dataset = self.dataset.filter(lambda example: example["task"] == "Count") - self.dataset = self.dataset.map( - transform_choices, - remove_columns=[ - "idx", - "type", - "filename", - "source", - "source_dataset", - "source_filename", - "target_class", - "target_size", - "bbox", - "prompt", - ], + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + subtask="Count", ) self.data_loaded = True -class CVBenchRelation(AbsTaskAny2TextMultipleChoice): +class CVBenchRelation(AbsTaskAny2AnyMultiChoice): metadata = TaskMetadata( name="CVBenchRelation", description="decide the relation of the objects in the image.", @@ -75,7 +137,7 @@ class CVBenchRelation(AbsTaskAny2TextMultipleChoice): "path": "nyu-visionx/CV-Bench", "revision": "22409a927ab5cf68e3655023d51694587455fc99", }, - type="VisionCentric", + type="VisionCentricQA", category="it2t", eval_splits=["test"], eval_langs=["eng-Latn"], @@ -97,30 +159,17 @@ class CVBenchRelation(AbsTaskAny2TextMultipleChoice): ) def load_data(self, **kwargs): - self.dataset = datasets.load_dataset(**self.metadata.dataset) - self.dataset_transform() - self.dataset = self.dataset.filter( - lambda example: example["task"] == "Relation" - ) - self.dataset = self.dataset.map( - transform_choices, - remove_columns=[ - "idx", - "type", - "filename", - "source", - "source_dataset", - "source_filename", - "target_class", - "target_size", - "bbox", - "prompt", - ], + self.corpus, self.queries, 
self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + subtask="Relation", ) self.data_loaded = True -class CVBenchDepth(AbsTaskAny2TextMultipleChoice): +class CVBenchDepth(AbsTaskAny2AnyMultiChoice): metadata = TaskMetadata( name="CVBenchDepth", description="judge the depth of the objects in the image with similarity matching.", @@ -129,7 +178,7 @@ class CVBenchDepth(AbsTaskAny2TextMultipleChoice): "path": "nyu-visionx/CV-Bench", "revision": "22409a927ab5cf68e3655023d51694587455fc99", }, - type="VisionCentric", + type="VisionCentricQA", category="it2t", eval_splits=["test"], eval_langs=["eng-Latn"], @@ -151,28 +200,17 @@ class CVBenchDepth(AbsTaskAny2TextMultipleChoice): ) def load_data(self, **kwargs): - self.dataset = datasets.load_dataset(**self.metadata.dataset) - self.dataset_transform() - self.dataset = self.dataset.filter(lambda example: example["task"] == "Depth") - self.dataset = self.dataset.map( - transform_choices, - remove_columns=[ - "idx", - "type", - "filename", - "source", - "source_dataset", - "source_filename", - "target_class", - "target_size", - "bbox", - "prompt", - ], + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + subtask="Depth", ) self.data_loaded = True -class CVBenchDistance(AbsTaskAny2TextMultipleChoice): +class CVBenchDistance(AbsTaskAny2AnyMultiChoice): metadata = TaskMetadata( name="CVBenchDistance", description="judge the distance of the objects in the image with similarity matching.", @@ -181,7 +219,7 @@ class CVBenchDistance(AbsTaskAny2TextMultipleChoice): "path": "nyu-visionx/CV-Bench", "revision": "22409a927ab5cf68e3655023d51694587455fc99", }, - type="VisionCentric", + type="VisionCentricQA", category="it2t", 
eval_splits=["test"], eval_langs=["eng-Latn"], @@ -203,24 +241,11 @@ class CVBenchDistance(AbsTaskAny2TextMultipleChoice): ) def load_data(self, **kwargs): - self.dataset = datasets.load_dataset(**self.metadata.dataset) - self.dataset_transform() - self.dataset = self.dataset.filter( - lambda example: example["task"] == "Distance" - ) - self.dataset = self.dataset.map( - transform_choices, - remove_columns=[ - "idx", - "type", - "filename", - "source", - "source_dataset", - "source_filename", - "target_class", - "target_size", - "bbox", - "prompt", - ], + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + subtask="Distance", ) self.data_loaded = True diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py deleted file mode 100644 index a41f77ab0e..0000000000 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice -from mteb.abstasks.TaskMetadata import TaskMetadata - - -class ImageCoDeT2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ImageCoDeT2IMultiChoice", - description="Identify the correct image from a set of similar images based on a precise caption.", - reference="https://aclanthology.org/2022.acl-long.241.pdf", - dataset={ - "path": "JamieSJS/imagecode-multi", - "revision": "d28adfd8b34fefa546fdf94bdc352622b2575f6c", - }, - type="Compositionality", - category="it2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2022-05-22", "2022-05-27"), # conference dates - domains=["Web", "Written"], - task_subtypes=["Image Text Retrieval"], - license="cc-by-sa-4.0", - 
annotations_creators="derived", - dialect=[], - modalities=["text", "image"], - sample_creation="found", - bibtex_citation="""@article{krojer2022image, - title={Image retrieval from contextual descriptions}, - author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, - journal={arXiv preprint arXiv:2203.15867}, - year={2022} -} -""", - ) diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py deleted file mode 100644 index 215200d6df..0000000000 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py +++ /dev/null @@ -1,108 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice -from mteb.abstasks.TaskMetadata import TaskMetadata - - -# NOTE: These tasks are marked as Any2AnyRetrieval types they are the correct implementations of ROxford retrieval and RParis retrieval -# (as it requires masking out the different docs in corpus for every query). This aligns with the MIEB papeer. 
-class ROxfordEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordEasyI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-easy-multi", - "revision": "4c167c3ce529f19457c9b8e694258cc6cf8e7cc7", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False - - -class ROxfordMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordMediumI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-medium-multi", - "revision": "83bd440268e200a4f60313070618e3f45000fa94", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - 
bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False - - -class ROxfordHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordHardI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-hard-multi", - "revision": "fc7c4ae6655b1e6b132f3b262a359acef42dfce8", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py deleted file mode 100644 index a759689d3d..0000000000 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py +++ /dev/null @@ -1,108 +0,0 @@ -from __future__ import annotations - -from 
mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice -from mteb.abstasks.TaskMetadata import TaskMetadata - - -# NOTE: These tasks are marked as Any2AnyRetrieval types they are the correct implementations of ROxford retrieval and RParis retrieval -# (as it requires masking out the different docs in corpus for every query). This aligns with the MIEB papeer. -class RParisEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisEasyI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-easy-multi", - "revision": "db94b5afd0014ab8c978f20a0fbcc52da1612a08", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False - - -class RParisMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisMediumI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-medium-multi", - "revision": "372c79fc823e1cebc1d55f8e0039aa239285e177", 
- }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False - - -class RParisHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisHardI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-hard-multi", - "revision": "4e5997e48fb2f2f8bf1c8973851dedeb17e09a83", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - ) - skip_first_result = False diff --git 
a/mteb/tasks/Image/Any2AnyMultiChoice/eng/__init__.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/__init__.py index a2adab646c..ceb3f41a0f 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/__init__.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/__init__.py @@ -2,26 +2,13 @@ from .BLINKIT2IMultiChoice import BLINKIT2IMultiChoice from .BLINKIT2TMultiChoice import BLINKIT2TMultiChoice -from .ImageCoDeT2IMultiChoice import ImageCoDeT2IMultiChoice -from .ROxfordI2IMultiChoice import ( - ROxfordEasyI2IMultiChoice, - ROxfordHardI2IMultiChoice, - ROxfordMediumI2IMultiChoice, -) -from .RParisI2IMultiChoice import ( - RParisEasyI2IMultiChoice, - RParisHardI2IMultiChoice, - RParisMediumI2IMultiChoice, -) +from .CVBench import CVBenchCount, CVBenchDepth, CVBenchDistance, CVBenchRelation __all__ = [ - "ImageCoDeT2IMultiChoice", + "CVBenchCount", + "CVBenchDepth", + "CVBenchDistance", + "CVBenchRelation", "BLINKIT2IMultiChoice", "BLINKIT2TMultiChoice", - "ROxfordEasyI2IMultiChoice", - "ROxfordHardI2IMultiChoice", - "ROxfordMediumI2IMultiChoice", - "RParisEasyI2IMultiChoice", - "RParisHardI2IMultiChoice", - "RParisMediumI2IMultiChoice", ] diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py index 894aeae763..743ed2333d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py @@ -1,23 +1,28 @@ from __future__ import annotations +import logging + +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import MultiChoiceEvaluationMixin from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata +logger = logging.getLogger(__name__) + -class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordEasyI2IRetrieval", 
description="Retrieve photos of landmarks in Oxford, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-easy", - "revision": "b71b5f67a93aa63761b79a67bcf28bd2ae590902", + "path": "JamieSJS/r-oxford-easy-multi", + "revision": "4c167c3ce529f19457c9b8e694258cc6cf8e7cc7", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -38,20 +43,20 @@ class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordMediumI2IRetrieval", description="Retrieve photos of landmarks in Oxford, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-medium", - "revision": "1dfb86730ee4b3f49b441f4896d473c83eb5ff0d", + "path": "JamieSJS/r-oxford-medium-multi", + "revision": "83bd440268e200a4f60313070618e3f45000fa94", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -72,20 +77,20 @@ class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class ROxfordHardI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordHardI2IRetrieval", description="Retrieve photos of landmarks in Oxford, UK.", 
reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-hard", - "revision": "f71ab9d4aabcda93d55a7e65edfb3a34767d89e6", + "path": "JamieSJS/r-oxford-hard-multi", + "revision": "fc7c4ae6655b1e6b132f3b262a359acef42dfce8", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py index 2bf7dca3c6..846786bbc6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py @@ -1,23 +1,24 @@ from __future__ import annotations +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import MultiChoiceEvaluationMixin from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata -class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisEasyI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-easy", - "revision": "7d821ddebcb30ad343133e3a81e23347ac2a08a8", + "path": "JamieSJS/r-paris-easy-multi", + "revision": "db94b5afd0014ab8c978f20a0fbcc52da1612a08", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object 
recognition"], @@ -38,20 +39,20 @@ class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisMediumI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-medium", - "revision": "3d959815e102785efd628170281f1e65561b03d2", + "path": "JamieSJS/r-paris-medium-multi", + "revision": "372c79fc823e1cebc1d55f8e0039aa239285e177", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -72,20 +73,20 @@ class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class RParisHardI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisHardI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-hard", - "revision": "d3e0adf4e942446c04427511ccce281c86861248", + "path": "JamieSJS/r-paris-hard-multi", + "revision": "4e5997e48fb2f2f8bf1c8973851dedeb17e09a83", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], diff --git 
def _load_vdr_multilingual_data(
    path: str,
    langs: list[str],
    split: str,
    cache_dir: str | None = None,
    revision: str | None = None,
    trust_remote_code: bool = False,
):
    """Load the VDR Multilingual dataset and reshape it for retrieval.

    One Hugging Face config is loaded per language. Every record contributes
    an image document to the corpus; records that carry a non-null ``query``
    additionally contribute a text query whose only relevant document is the
    image from the same record.

    Args:
        path: Hugging Face dataset repository to load from.
        langs: Config names to load, one per language (e.g. ``"en"``).
        split: Split to load; also used as the split key in the outputs.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.
        revision: Optional dataset revision forwarded to ``load_dataset``.
        trust_remote_code: Forwarded to ``load_dataset``.

    Returns:
        A ``(corpus, queries, relevant_docs)`` tuple, each keyed by language:

        * ``corpus[lang]`` is a ``DatasetDict`` mapping ``split`` to rows of
          ``{"id", "image", "modality"}``.
        * ``queries[lang]`` is a ``DatasetDict`` mapping ``split`` to rows of
          ``{"id", "text", "modality"}``.
        * ``relevant_docs[lang][split]`` maps each query id to
          ``{doc_id: 1}``.
    """
    corpus = {}
    queries = {}
    relevant_docs = {}

    for lang_code in langs:
        dataset = datasets.load_dataset(
            path=path,
            name=lang_code,
            split=split,
            cache_dir=cache_dir,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )

        corpus_records = []
        query_records = []
        qrels = {}

        for record in dataset:
            # Document and query ids share the record id, so each pair can be
            # traced back to the row it came from.
            doc_id = f"doc-{record['id']}"
            corpus_records.append(
                {
                    "id": doc_id,
                    "image": record.get("image"),
                    "modality": "image",
                }
            )

            # Rows without a query are corpus-only documents (distractors).
            query_text = record.get("query")
            if query_text is None:
                continue

            query_id = f"query-{record['id']}"
            query_records.append(
                {
                    "id": query_id,
                    "text": query_text,
                    "modality": "text",
                }
            )
            qrels.setdefault(query_id, {})[doc_id] = 1

        corpus[lang_code] = DatasetDict({split: Dataset.from_list(corpus_records)})
        queries[lang_code] = DatasetDict({split: Dataset.from_list(query_records)})
        relevant_docs[lang_code] = {split: qrels}

    return corpus, queries, relevant_docs
class ImageCoDe(AbsTaskImageTextPairClassification):
    """ImageCoDe: pick the image a caption describes out of ten candidates.

    The dataset is published as separate ``corpus`` / ``query`` / ``qrels``
    configs; ``load_data`` flattens them into one row per query holding the
    caption, the correct image and up to nine distractor images.
    """

    # Columns of the flattened dataset built by ``load_data``.
    texts_column_names = ["text"]
    images_column_names = ["correct_image"] + [
        f"false_image_{n}" for n in range(1, 10)
    ]

    metadata = TaskMetadata(
        name="ImageCoDe",
        description="Identify the correct image from a set of similar images based on a precise caption.",
        reference="https://aclanthology.org/2022.acl-long.241.pdf",
        dataset={
            "path": "JamieSJS/imagecode-multi",
            "revision": "d28adfd8b34fefa546fdf94bdc352622b2575f6c",
        },
        type="Compositionality",
        category="it2i",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="image_acc",
        date=("2022-05-22", "2022-05-27"),  # conference dates
        domains=["Web", "Written"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="found",
        bibtex_citation="""@article{krojer2022image,
  title={Image retrieval from contextual descriptions},
  author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva},
  journal={arXiv preprint arXiv:2203.15867},
  year={2022}
}
""",
    )

    def load_data(self, **kwargs):
        """Download the three configs and build the per-query test split.

        Does nothing when the data has already been loaded. ``cache_dir`` may
        be passed as a keyword argument and is forwarded to ``load_dataset``.
        """
        if self.data_loaded:
            return

        repo = self.metadata.dataset["path"]
        revision = self.metadata.dataset["revision"]
        cache_dir = kwargs.get("cache_dir", None)

        def fetch(config, split):
            # Every config lives in the same repo at the same pinned revision.
            return load_dataset(
                repo,
                config,
                cache_dir=cache_dir,
                revision=revision,
            )[split]

        corpus = fetch("corpus", "corpus")
        query = fetch("query", "test")
        qrels = fetch("qrels", "test")

        # Corpus id -> row position, so images can be fetched by index.
        row_of = {cid: pos for pos, cid in enumerate(corpus["id"])}

        # For each query: every candidate listed in qrels, and the one
        # candidate scored 1 (the correct image).
        gold_by_query = {}
        pool_by_query = {}
        for rel in qrels:
            qid = rel["query-id"]
            cid = rel["corpus-id"]
            pool_by_query.setdefault(qid, []).append(cid)
            if rel["score"] == 1:
                gold_by_query[qid] = cid

        def process_example(example):
            qid = example["id"]
            gold_id = gold_by_query[qid]
            pool = pool_by_query[qid]

            rows = [row_of[cid] for cid in pool]
            is_gold = np.array([cid == gold_id for cid in pool])
            gold_row = rows[np.argmax(is_gold)]
            distractor_rows = [rows[j] for j in np.flatnonzero(~is_gold)]

            record = {
                "text": example["text"],
                "correct_image": corpus[gold_row]["image"],
            }
            # Keep at most nine distractors, matching images_column_names.
            for n, row in enumerate(distractor_rows[:9], start=1):
                record[f"false_image_{n}"] = corpus[row]["image"]
            return record

        dataset = query.map(
            process_example, num_proc=4, remove_columns=query.column_names
        )

        self.dataset = DatasetDict({"test": dataset})
        self.dataset_transform()
        self.data_loaded = True
-3,13 +3,10 @@ from .Any2AnyMultiChoice import ( BLINKIT2IMultiChoice, BLINKIT2TMultiChoice, - ImageCoDeT2IMultiChoice, - ROxfordEasyI2IMultiChoice, - ROxfordHardI2IMultiChoice, - ROxfordMediumI2IMultiChoice, - RParisEasyI2IMultiChoice, - RParisHardI2IMultiChoice, - RParisMediumI2IMultiChoice, + CVBenchCount, + CVBenchDepth, + CVBenchDistance, + CVBenchRelation, ) from .Any2AnyRetrieval import ( CUB200I2I, @@ -75,12 +72,6 @@ XFlickr30kCoT2IRetrieval, XM3600T2IRetrieval, ) -from .Any2TextMultipleChoice import ( - CVBenchCount, - CVBenchDepth, - CVBenchDistance, - CVBenchRelation, -) from .Clustering import ( CIFAR10Clustering, CIFAR100Clustering, @@ -117,6 +108,7 @@ AROFlickrOrder, AROVisualAttribution, AROVisualRelation, + ImageCoDe, SugarCrepe, Winoground, ) @@ -191,25 +183,19 @@ "BirdsnapClassification", "RESISC45Classification", "FGVCAircraftClassification", - "ImageCoDeT2IMultiChoice", + "CVBenchCount", + "CVBenchDepth", + "CVBenchDistance", + "CVBenchRelation", "BLINKIT2IMultiChoice", "BLINKIT2TMultiChoice", - "ROxfordEasyI2IMultiChoice", - "ROxfordHardI2IMultiChoice", - "ROxfordMediumI2IMultiChoice", - "RParisEasyI2IMultiChoice", - "RParisHardI2IMultiChoice", - "RParisMediumI2IMultiChoice", "Winoground", + "ImageCoDe", "AROFlickrOrder", "AROVisualRelation", "SugarCrepe", "AROVisualAttribution", "AROCocoOrder", - "CVBenchCount", - "CVBenchDepth", - "CVBenchDistance", - "CVBenchRelation", "MemotionI2TRetrieval", "BLINKIT2TRetrieval", "InfoSeekIT2ITRetrieval", diff --git a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py index 063f2e6a45..59b92e81f6 100644 --- a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py @@ -304,6 +304,7 @@ class MIRACLRetrievalHardNegatives(AbsTaskRetrieval): dataset={ "path": "mteb/miracl-hard-negatives", "revision": "95c8db7d4a6e9c1d8a60601afd63d553ae20a2eb", + "trust_remote_code": True, }, type="Retrieval", 
category="t2t", @@ -346,7 +347,7 @@ def load_data(self, **kwargs): splits=self.metadata.eval_splits, cache_dir=kwargs.get("cache_dir", None), revision=self.metadata.dataset["revision"], - trust_remote_code=self.metadata.dataset["trust_remote_code"], + trust_remote_code=self.metadata.dataset.get("trust_remote_code", False), ) ) diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index d09cc73a64..90fc8ab49e 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -493,7 +493,7 @@ GTSRBZeroShotClassification, HatefulMemesI2TRetrieval, HatefulMemesT2IRetrieval, - ImageCoDeT2IMultiChoice, + ImageCoDe, ImageCoDeT2IRetrieval, Imagenet1kClassification, Imagenet1kZeroShotClassification, @@ -522,18 +522,12 @@ RenderedSST2, RESISC45Classification, RESISC45ZeroShotClassification, - ROxfordEasyI2IMultiChoice, ROxfordEasyI2IRetrieval, - ROxfordHardI2IMultiChoice, ROxfordHardI2IRetrieval, - ROxfordMediumI2IMultiChoice, ROxfordMediumI2IRetrieval, RP2kI2IRetrieval, - RParisEasyI2IMultiChoice, RParisEasyI2IRetrieval, - RParisHardI2IMultiChoice, RParisHardI2IRetrieval, - RParisMediumI2IMultiChoice, RParisMediumI2IRetrieval, SciMMIR, SciMMIRI2TRetrieval, @@ -1016,25 +1010,19 @@ "BirdsnapClassification", "RESISC45Classification", "FGVCAircraftClassification", - "ImageCoDeT2IMultiChoice", + "CVBenchCount", + "CVBenchDepth", + "CVBenchDistance", + "CVBenchRelation", "BLINKIT2IMultiChoice", "BLINKIT2TMultiChoice", - "ROxfordEasyI2IMultiChoice", - "ROxfordHardI2IMultiChoice", - "ROxfordMediumI2IMultiChoice", - "RParisEasyI2IMultiChoice", - "RParisHardI2IMultiChoice", - "RParisMediumI2IMultiChoice", "Winoground", + "ImageCoDe", "AROFlickrOrder", "AROVisualRelation", "SugarCrepe", "AROVisualAttribution", "AROCocoOrder", - "CVBenchCount", - "CVBenchDepth", - "CVBenchDistance", - "CVBenchRelation", "MemotionI2TRetrieval", "BLINKIT2TRetrieval", "InfoSeekIT2ITRetrieval", diff --git a/pyproject.toml b/pyproject.toml index d018f60b85..89176c2ca1 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.36.30" +version = "1.36.35" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -70,10 +70,8 @@ docs = [ "mkdocstrings[python]>=0.18", "mkdocs-bibtex>=2.16.2", ] - speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] - leaderboard = [ "gradio==5.16.0; python_version > '3.9'", # 3.10 is required for gradio "gradio_rangeslider>=0.0.8", @@ -81,7 +79,7 @@ leaderboard = [ "cachetools>=5.2.0", "matplotlib>=3.9.4", ] -flagembedding = ["FlagEmbedding"] +flagembedding = ["FlagEmbedding==1.3.4"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] @@ -91,7 +89,13 @@ bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"] gritlm = ["gritlm>=1.0.2"] xformers = ["xformers>=0.0.29"] blip2 = ["salesforce-lavis>=1.0.2"] - +voyageai = ["voyageai>=1.0.0,<2.0.0"] +voyage_v = ["voyageai>1.0.0,<2.0.0", "tenacity>1.0.0,<2.0.0"] +cohere = ["cohere==5.14.0"] +vertexai = ["vertexai==1.71.1"] +ll2vec = ["ll2vec==0.2.3"] +timm = ["timm==1.0.15"] +open_clip_torch = ["open_clip_torch==2.31.0"] [tool.coverage.report] diff --git a/scripts/make_leaderboard.py b/scripts/make_leaderboard.py index fed85e383f..4e322b3210 100644 --- a/scripts/make_leaderboard.py +++ b/scripts/make_leaderboard.py @@ -7,7 +7,7 @@ import pandas as pd import mteb -from mteb.leaderboard.table import scores_to_tables +from mteb.leaderboard.table import create_tables from mteb.load_results import load_results logging.basicConfig(level=logging.INFO) @@ -64,7 +64,7 @@ def load_leaderboard( scores_long = benchmark_results_filtered.get_scores(format="long") # Convert scores into leaderboard tables - summary_gr_df, per_task_gr_df = scores_to_tables(scores_long=scores_long) + summary_gr_df, per_task_gr_df = create_tables(scores_long=scores_long) # Convert Gradio DataFrames to Pandas summary_df = pd.DataFrame( diff --git 
a/scripts/run_mieb.py b/scripts/run_mieb.py index a0d444617e..ea77d2f635 100644 --- a/scripts/run_mieb.py +++ b/scripts/run_mieb.py @@ -60,7 +60,7 @@ task_types=[ "Any2AnyRetrieval", "Any2AnyMultiChoice", - "VisionCentric", + "VisionCentricQA", "ImageClustering", "ImageClassification", "ImageMultilabelClassification", diff --git a/scripts/run_mieb_rerun_siglip.py b/scripts/run_mieb_rerun_siglip.py index 5c7bad9f27..f6849468c7 100644 --- a/scripts/run_mieb_rerun_siglip.py +++ b/scripts/run_mieb_rerun_siglip.py @@ -17,7 +17,7 @@ task_types=[ "Any2AnyRetrieval", "Any2AnyMultiChoice", - "VisionCentric", + "VisionCentricQA", "ImageClustering", "ImageClassification", "ImageMultilabelClassification", diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py index 7acb033083..8df791d3a2 100644 --- a/tests/test_benchmark/mock_models.py +++ b/tests/test_benchmark/mock_models.py @@ -52,7 +52,8 @@ class MockSentenceTransformersbf16Encoder(SentenceTransformer): """Ensure that data types not supported by the encoder are converted to the supported data type.""" model_card_data = Namespace( - model_name="MockSentenceTransformersbf16Encoder", base_model_revision="1.0.0" + model_name="mock/MockSentenceTransformersbf16Encoder", + base_model_revision="1.0.0", ) def __init__(self): @@ -71,7 +72,7 @@ def get_sentence_embedding_dimension() -> int: class MockCLIPEncoder: mteb_model_meta = ModelMeta( loader=None, - name="MockCLIPModel", + name="mock/MockCLIPModel", languages=["eng_Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", release_date="2021-02-06", @@ -118,7 +119,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): class MockMocoEncoder: mteb_model_meta = ModelMeta( loader=None, - name="MockMocoModel", + name="mock/MockMocoModel", languages=["eng_Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-01-01", diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 
ac6836863c..43ff793282 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -19,9 +19,6 @@ from mteb.abstasks.AbsTaskSummarization import AbsTaskSummarization from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval -from mteb.abstasks.Image.AbsTaskAny2TextMultipleChoice import ( - AbsTaskAny2TextMultipleChoice, -) from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering from mteb.abstasks.Image.AbsTaskImageMultilabelClassification import ( # noqa @@ -2601,46 +2598,6 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockTextMultipleChoiceTask(AbsTaskAny2TextMultipleChoice): - expected_stats = { - "test": { - # TODO: Add descriptive stats - } - } - - metadata = TaskMetadata( - type="VisionCentric", - name="MockTextMultipleChoice", - main_score="accuracy", - **general_args, # type: ignore - ) - metadata.modalities = ["text", "image"] - metadata.category = "it2i" - - def load_data(self, **kwargs): - images = [np.random.randint(0, 255, (100, 100, 3)) for _ in range(2)] # noqa: NPY002 - images = [ - Image.fromarray(image.astype("uint8")).convert("RGBA") for image in images - ] - - self.dataset = DatasetDict( - { - "test": Dataset.from_dict( - { - "id": [f"q{i}" for i in range(2)], - "image": [images[i] for i in range(2)], - "question": [ - "This is a positive sentence", - "This is another positive sentence", - ], - "choices": [["3", "2", "1", "0"], ["3", "2", "1", "0"]], - "answer": ["1", "0"], - } - ) - } - ) - - class MockImageClassificationTask(AbsTaskImageClassification): expected_stats = { "test": { diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 9693468697..3105a280ef 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ 
-43,7 +43,6 @@ MockRetrievalTask, MockSTSTask, MockSummarizationTask, - MockTextMultipleChoiceTask, MockVisualSTSTask, MockZeroShotClassificationTask, ) @@ -112,7 +111,6 @@ MOCK_MIEB_TASK_GRID = [ MockAny2AnyRetrievalI2TTask(), MockAny2AnyRetrievalT2ITask(), - MockTextMultipleChoiceTask(), MockMultiChoiceTask(), MockImageClassificationTask(), MockImageClassificationKNNTask(), diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 6df4888f3d..aff9398e7d 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -13,6 +13,9 @@ from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.Image.AbsTaskImageTextPairClassification import ( + AbsTaskImageTextPairClassification, +) from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import ( @@ -76,6 +79,7 @@ def test_load_data( or isinstance(task, AbsTaskAny2AnyRetrieval) or isinstance(task, AbsTaskSpeedTask) or isinstance(task, AbsTaskAny2AnyMultiChoice) + or isinstance(task, AbsTaskImageTextPairClassification) or task.metadata.is_multilingual ): pytest.skip() diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 8a47a235c7..f127bc7e57 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -405,7 +405,7 @@ def test_reranker_same_ndcg1(tmp_path: Path): # read in stage 1 and stage two and check ndcg@1 is the same with open( - f"tests/results/stage1/{de_name.replace('/', '__')}/{revision}/SciFact.json" + f"{stage1_path}/{de_name.replace('/', '__')}/{revision}/SciFact.json" ) as f: stage1 = json.load(f)