From 53756ad59d48c8fede1bd4a85a9ad3f1ba948cbb Mon Sep 17 00:00:00 2001 From: Daniel Buades Marcos Date: Tue, 10 Dec 2024 20:32:13 +0100 Subject: [PATCH 01/31] feat: add new arctic v2.0 models (#1574) * feat: add new arctic v2.0 models * chore: make lint --- mteb/models/arctic_models.py | 187 +++++++++++++++++++++++++++++------ mteb/models/misc_models.py | 2 + 2 files changed, 157 insertions(+), 32 deletions(-) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 9ac70fd638..eadc4065fe 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -4,33 +4,82 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -arctic_m_v1_5 = ModelMeta( - loader=partial( - sentence_transformers_loader, - model_name="Snowflake/snowflake-arctic-embed-m-v1.5", - revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", - model_prompts={ - "query": "Represent this sentence for searching relevant passages: " - }, - ), - name="Snowflake/snowflake-arctic-embed-m-v1.5", - revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", - release_date="2024-07-08", # initial commit of hf model. 
- languages=["eng_Latn"], - open_weights=True, - framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, - memory_usage=None, - max_tokens=512, - embed_dim=768, - license="apache-2.0", - reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", - similarity_fn_name="cosine", - use_instructions=False, - adapted_from=None, - superseded_by=None, -) - +LANGUAGES_V2_0 = [ + "afr_Latn", + "ara_Arab", + "aze_Latn", + "bel_Cyrl", + "bul_Cyrl", + "ben_Beng", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "ell_Grek", + "eng_Latn", + "spa_Latn", + "est_Latn", + "eus_Latn", + "fas_Arab", + "fin_Latn", + "fra_Latn", + "glg_Latn", + "guj_Gujr", + "heb_Hebr", + "hin_Deva", + "hrv_Latn", + "hat_Latn", + "hun_Latn", + "hye_Armn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jpn_Jpan", + "jav_Latn", + "kat_Geor", + "kaz_Cyrl", + "khm_Khmr", + "kan_Knda", + "kor_Hang", + "kir_Cyrl", + "lao_Laoo", + "lit_Latn", + "lav_Latn", + "mkd_Cyrl", + "mal_Mlym", + "mon_Cyrl", + "mar_Deva", + "msa_Latn", + "mya_Mymr", + "nep_Deva", + "nld_Latn", + "pan_Guru", + "pol_Latn", + "por_Latn", + "que_Latn", + "ron_Latn", + "rus_Cyrl", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "som_Latn", + "sqi_Latn", + "srp_Cyrl", + "swe_Latn", + "swa_Latn", + "tam_Taml", + "tel_Telu", + "tha_Thai", + "tgl_Latn", + "tur_Latn", + "ukr_Cyrl", + "urd_Arab", + "vie_Latn", + "yor_Latn", + "zho_Hans", +] arctic_embed_xs = ModelMeta( loader=partial( @@ -118,7 +167,7 @@ languages=["eng_Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, + n_parameters=137_000_000, memory_usage=None, max_tokens=2048, embed_dim=768, @@ -127,10 +176,9 @@ similarity_fn_name="cosine", use_instructions=False, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", - superseded_by=None, + superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", ) - arctic_embed_l = ModelMeta( loader=partial( 
sentence_transformers_loader, @@ -143,14 +191,89 @@ languages=["eng_Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, + n_parameters=335_000_000, memory_usage=None, max_tokens=512, - embed_dim=768, + embed_dim=1024, license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l", similarity_fn_name="cosine", use_instructions=False, adapted_from="intfloat/e5-base-unsupervised", + superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", +) + +arctic_embed_m_v1_5 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-m-v1.5", + revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", + model_prompts={ + "query": "Represent this sentence for searching relevant passages: " + }, + ), + name="Snowflake/snowflake-arctic-embed-m-v1.5", + revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", + release_date="2024-07-08", # initial commit of hf model. + languages=["eng_Latn"], + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=109_000_000, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from=None, + superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", +) + +arctic_embed_m_v2_0 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-m-v2.0", + revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", + ), + name="Snowflake/snowflake-arctic-embed-m-v2.0", + revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", + release_date="2024-12-04", # initial commit of hf model. 
+ languages=LANGUAGES_V2_0, + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=305_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=768, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from="Alibaba-NLP/gte-multilingual-base", + superseded_by=None, +) + +arctic_embed_l_v2_0 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-l-v2.0", + revision="edc2df7b6c25794b340229ca082e7c78782e6374", + ), + name="Snowflake/snowflake-arctic-embed-l-v2.0", + revision="edc2df7b6c25794b340229ca082e7c78782e6374", + release_date="2024-12-04", # initial commit of hf model. + languages=LANGUAGES_V2_0, + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=568_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=1024, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from="BAAI/bge-m3-retromae", superseded_by=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 2429cce39b..61dc549b15 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from mteb.model_meta import ModelMeta Haon_Chen__speed_embedding_7b_instruct = ModelMeta( From 27f7d8cba14de8c80758e49c0e9fad84af7396bc Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 10 Dec 2024 19:47:32 +0000 Subject: [PATCH 02/31] 1.24.0 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 604ebbbea9..956ca50631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" 
-version = "1.23.2" +version = "1.24.0" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 7b9b3c98a26506d64808bdfb082e1f853f3f4f71 Mon Sep 17 00:00:00 2001 From: Omar Elshehy <41394057+omarelshehy@users.noreply.github.com> Date: Thu, 12 Dec 2024 00:18:10 +0100 Subject: [PATCH 03/31] fix: Add namaa MrTydi reranking dataset (#1573) * Add dataset class and file requirements * pass tests * make lint changes * adjust meta data and remove load_data --------- Co-authored-by: Omar Elshehy --- mteb/tasks/Reranking/__init__.py | 1 + .../Reranking/ara/NamaaMrTydiReranking.py | 39 +++++++++++++++++++ mteb/tasks/Reranking/ara/__init__.py | 0 3 files changed, 40 insertions(+) create mode 100644 mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py create mode 100644 mteb/tasks/Reranking/ara/__init__.py diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index a4b302a17f..2c3a27919a 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .ara.NamaaMrTydiReranking import * from .eng.AskUbuntuDupQuestions import * from .eng.MindSmallReranking import * from .eng.SciDocsReranking import * diff --git a/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py b/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py new file mode 100644 index 0000000000..4a9d755747 --- /dev/null +++ b/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskReranking import AbsTaskReranking + + +class NamaaMrTydiReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="NamaaMrTydiReranking", + description="Mr. TyDi is a multi-lingual benchmark dataset built on TyDi, covering eleven typologically diverse languages. It is designed for monolingual retrieval, specifically to evaluate ranking with learned dense representations. 
This dataset adapts the arabic test split for Reranking evaluation purposes by the addition of multiple (Hard) Negatives to each query and positive", + reference="https://huggingface.co/NAMAA-Space", + dataset={ + "path": "NAMAA-Space/mteb-eval-mrtydi", + "revision": "502637220a7ad0ecc5c39ff5518d7508d2624af8", + }, + type="Reranking", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ara-Arab"], + main_score="map", + date=("2023-11-01", "2024-05-15"), + domains=["Encyclopaedic", "Written"], + task_subtypes=[], + license="cc-by-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{muennighoff2022mteb, + doi = {10.48550/ARXIV.2210.07316}, + url = {https://arxiv.org/abs/2210.07316}, + author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, + title = {MTEB: Massive Text Embedding Benchmark}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2210.07316}, + year = {2022} +}""", + ) diff --git a/mteb/tasks/Reranking/ara/__init__.py b/mteb/tasks/Reranking/ara/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 1101db7936cf0199aa587c55b7b34a3cd569e6ce Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 11 Dec 2024 23:20:21 +0000 Subject: [PATCH 04/31] Update tasks table --- docs/tasks.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index f32f90cf93..70cff09bdd 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -360,6 +360,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': {'num_samples': 3826252, 'number_of_characters': 988355274, 'unique_pairs': 3820263, 'min_sentence1_length': 1, 'average_sentence1_length': 129.15, 'max_sentence1_length': 773, 'unique_sentence1': 241259, 'min_sentence2_length': 1, 'average_sentence2_length': 129.15, 'max_sentence2_length': 773, 'unique_sentence2': 241259, 'hf_subset_descriptive_stats': {'afr_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'afr_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 
1996}, 'afr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'afr_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'afr_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'afr_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'afr_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'afr_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 
'unique_sentence2': 1996}, 'afr_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'afr_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'amh_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'amh_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'amh_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'amh_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 
'unique_sentence2': 1996}, 'amh_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'amh_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'amh_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'amh_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'amh_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'amh_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 
'unique_sentence2': 1997}, 'amh_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'amh_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'amh_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'amh_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'arb_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'arb_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 
'unique_sentence2': 1995}, 'arb_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'arb_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'arb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'arb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'arb_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'arb_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 
512, 'unique_sentence2': 1996}, 'arb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'arb_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'arb_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'arb_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'arb_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'arb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 
420, 'unique_sentence2': 1996}, 'arb_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'arb_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'arb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'arb_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'arb_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'arb_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 
'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'arb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'arb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'arb_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'arb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'arb_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'arb_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 
136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'arb_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'arb_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'arb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'arb_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'arb_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'arb_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 
'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'arb_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'aze_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'aze_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'aze_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'aze_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'aze_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'aze_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'aze_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'aze_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'aze_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bak_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'bak_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 
1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bak_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'bak_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'bak_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'bak_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'bak_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'bak_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 
'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'bak_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bel_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bel_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bel_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bel_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bel_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 
422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bel_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bel_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bel_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bel_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bel_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bel_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 
'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bel_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bel_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bem_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bem_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'bem_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'bem_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 
149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'bem_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'bem_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'bem_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'bem_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'ben_Beng-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ben_Beng-deu_Latn': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 6, 
'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ben_Beng-div_Thaa': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'ben_Beng-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ben_Beng-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ben_Beng-eus_Latn': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'ben_Beng-fas_Arab': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ben_Beng-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 
'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ben_Beng-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ben_Beng-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'ben_Beng-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ben_Beng-hin_Deva': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ben_Beng-hun_Latn': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ben_Beng-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 
1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ben_Beng-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ben_Beng-kan_Knda': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'ben_Beng-kor_Hang': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ben_Beng-lit_Latn': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ben_Beng-mar_Deva': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'ben_Beng-nep_Deva': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 
1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ben_Beng-pan_Guru': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'ben_Beng-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ben_Beng-por_Latn': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ben_Beng-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ben_Beng-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 502543, 
'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ben_Beng-snd_Arab': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'ben_Beng-spa_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ben_Beng-swa_Latn': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ben_Beng-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ben_Beng-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ben_Beng-tel_Telu': {'num_samples': 1997, 'number_of_characters': 
491329, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-tur_Latn': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ben_Beng-urd_Arab': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ben_Beng-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ben_Beng-zho_Hant': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ben_Beng-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'bod_Tibt-dzo_Tibt': {'num_samples': 1997, 
'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'bod_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bod_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'bod_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'bod_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'bod_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'bod_Tibt-tha_Thai': {'num_samples': 
1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'bos_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bos_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bos_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bos_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bos_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bos_Latn-mkd_Cyrl': 
{'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bos_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bos_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bos_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bos_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bos_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 
'bos_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bos_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bul_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bul_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bul_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bul_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 
1997}, 'bul_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bul_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bul_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bul_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bul_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bul_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 
'unique_sentence2': 1996}, 'bul_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bul_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bul_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'cat_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cat_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'cat_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 
468, 'unique_sentence2': 1996}, 'cat_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'cat_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'cat_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'cat_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'cat_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ces_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 
'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ces_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ces_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ces_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ces_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ces_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ces_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 
139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ces_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ces_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ces_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ces_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ces_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'ces_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 
'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ckb_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ckb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ckb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ckb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ckb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'ckb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 
6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'ckb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'ckb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'ckb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'ckb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'cym_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cym_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 
'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'dan_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'dan_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'dan_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dan_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'dan_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'dan_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 
1995, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'dan_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'dan_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'dan_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'dan_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 
'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'deu_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'deu_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'deu_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'deu_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'deu_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'deu_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 
508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'deu_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'deu_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'deu_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'deu_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'deu_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'deu_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 
'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'deu_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'deu_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'deu_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'deu_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'deu_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'deu_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 
'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'deu_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'deu_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'deu_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'deu_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'deu_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'deu_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 
148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'deu_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'deu_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'deu_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'deu_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'deu_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'div_Thaa-ben_Beng': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'div_Thaa-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'div_Thaa-eus_Latn': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'div_Thaa-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'div_Thaa-hin_Deva': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 
9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'div_Thaa-kan_Knda': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'div_Thaa-mar_Deva': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'div_Thaa-nep_Deva': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-pan_Guru': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'div_Thaa-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'div_Thaa-snd_Arab': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'div_Thaa-tam_Taml': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'div_Thaa-tel_Telu': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-urd_Arab': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'dzo_Tibt-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'dzo_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dzo_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 
1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'dzo_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'dzo_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'dzo_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'dzo_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'ell_Grek-arb_Arab': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ell_Grek-ben_Beng': {'num_samples': 1997, 'number_of_characters': 542929, 
'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ell_Grek-deu_Latn': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ell_Grek-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ell_Grek-fas_Arab': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ell_Grek-fin_Latn': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ell_Grek-fra_Latn': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ell_Grek-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 
499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ell_Grek-hin_Deva': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ell_Grek-hun_Latn': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ell_Grek-hye_Armn': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ell_Grek-ind_Latn': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ell_Grek-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ell_Grek-kat_Geor': {'num_samples': 1997, 'number_of_characters': 
565719, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'ell_Grek-kor_Hang': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ell_Grek-lit_Latn': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ell_Grek-nld_Latn': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ell_Grek-pol_Latn': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ell_Grek-por_Latn': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ell_Grek-rus_Cyrl': {'num_samples': 1997, 
'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ell_Grek-spa_Latn': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ell_Grek-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ell_Grek-swa_Latn': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ell_Grek-swe_Latn': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ell_Grek-tam_Taml': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ell_Grek-tur_Latn': {'num_samples': 
1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ell_Grek-vie_Latn': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ell_Grek-zho_Hant': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ell_Grek-zul_Latn': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eng_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'eng_Latn-arb_Arab': 
{'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'eng_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'eng_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'eng_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'eng_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'eng_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 
'eng_Latn-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'eng_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'eng_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'eng_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'eng_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'eng_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 
1995}, 'eng_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'eng_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'eng_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'eng_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eng_Latn-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'eng_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 
'unique_sentence2': 1996}, 'eng_Latn-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'eng_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'eng_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'eng_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'eng_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'eng_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 
554, 'unique_sentence2': 1997}, 'eng_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'eng_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'eng_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'eng_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 
'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eng_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'eng_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'eng_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eng_Latn-hmn_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 165.64, 'max_sentence2_length': 643, 'unique_sentence2': 1997}, 'eng_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'eng_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'eng_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'eng_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'eng_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'eng_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eng_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'eng_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'eng_Latn-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'eng_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'eng_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 
8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'eng_Latn-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'eng_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'eng_Latn-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'eng_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'eng_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'eng_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'eng_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'eng_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eng_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'eng_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'eng_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'eng_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 
1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-mon_Mong': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'eng_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'eng_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'eng_Latn-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'eng_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'eng_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 
'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'eng_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'eng_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'eng_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'eng_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 
437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'eng_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eng_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'eng_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'eng_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'eng_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 
'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'eng_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'eng_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'eng_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'eng_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 
124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'eng_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'eng_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eng_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'eng_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'eng_Latn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'eng_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'eng_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'eng_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'eng_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'eng_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'eng_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'eng_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eng_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'eng_Latn-tha_Thai': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'eng_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'eng_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'eng_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'eng_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'eng_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'eng_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'eng_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'eng_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495718, 
'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'eng_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'eng_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'eng_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'eng_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'eng_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 
563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'eng_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'eng_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'eng_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eus_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eus_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 
579051, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eus_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'eus_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eus_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eus_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eus_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eus_Latn-nep_Deva': {'num_samples': 1997, 
'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eus_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eus_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eus_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eus_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-urd_Arab': {'num_samples': 
1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ewe_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ewe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ewe_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ewe_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ewe_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ewe_Latn-nya_Latn': 
{'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ewe_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'ewe_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'fao_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fao_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'fao_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 
'fao_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fao_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'fao_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'fao_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fao_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'fao_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 
'fao_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fas_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fas_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'fas_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fas_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 
1996}, 'fas_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fas_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fas_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fas_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fas_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fas_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 
'unique_sentence2': 1997}, 'fas_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fas_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fas_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'fas_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fas_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fas_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 
'unique_sentence2': 1993}, 'fas_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fas_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fas_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fas_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'fas_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'fas_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 
419, 'unique_sentence2': 1996}, 'fas_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'fas_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fas_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fas_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fas_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 
'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'fas_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fas_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fas_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fas_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fij_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fij_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 10, 'average_sentence2_length': 
146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'fij_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fij_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fij_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fij_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fij_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fij_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 8, 
'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fij_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fij_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fil_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fil_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'fil_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fil_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fil_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fil_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fil_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fil_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fil_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fil_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 
'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fin_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fin_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fin_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fin_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fin_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 
'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fin_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fin_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fin_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fin_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fin_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fin_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 
'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fin_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fin_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'fin_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fin_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fin_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fin_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fin_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fin_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fin_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fin_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fin_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fin_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 
'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fin_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fin_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fin_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fra_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fra_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fra_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'fra_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fra_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fra_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fra_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fra_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fra_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 
512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fra_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fra_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fra_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fra_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'fra_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 
512, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fra_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fra_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fra_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'fra_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fra_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 
512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fra_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'fra_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fra_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fra_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fra_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fra_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 
'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fra_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fra_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fra_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fra_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fuc_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'fuc_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fuc_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'fuc_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'fuc_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'fuc_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'fuc_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'fuc_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 6, 
'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'gle_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'gle_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'glg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'glg_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 
6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'glg_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'glg_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'glg_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'glg_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'guj_Gujr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'guj_Gujr-div_Thaa': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 
7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'guj_Gujr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'guj_Gujr-eus_Latn': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'guj_Gujr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'guj_Gujr-kan_Knda': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'guj_Gujr-mar_Deva': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'guj_Gujr-nep_Deva': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-pan_Guru': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'guj_Gujr-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'guj_Gujr-snd_Arab': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'guj_Gujr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'guj_Gujr-tel_Telu': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-urd_Arab': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hau_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'hau_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hau_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'hau_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'hau_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'hau_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 
'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'hau_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'hau_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hau_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'hau_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'hau_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'hau_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 
538056, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'hau_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'hau_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'heb_Hebr-arb_Arab': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'heb_Hebr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'heb_Hebr-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'heb_Hebr-deu_Latn': {'num_samples': 1997, 
'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'heb_Hebr-ell_Grek': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'heb_Hebr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'heb_Hebr-fas_Arab': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'heb_Hebr-fin_Latn': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'heb_Hebr-fra_Latn': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'heb_Hebr-hin_Deva': {'num_samples': 1997, 
'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'heb_Hebr-hun_Latn': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'heb_Hebr-ind_Latn': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'heb_Hebr-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'heb_Hebr-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'heb_Hebr-kor_Hang': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'heb_Hebr-lit_Latn': {'num_samples': 1997, 
'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'heb_Hebr-mey_Arab': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'heb_Hebr-nld_Latn': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'heb_Hebr-pol_Latn': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'heb_Hebr-por_Latn': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'heb_Hebr-prs_Arab': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'heb_Hebr-pus_Arab': {'num_samples': 1997, 
'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'heb_Hebr-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'heb_Hebr-shi_Arab': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'heb_Hebr-spa_Latn': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'heb_Hebr-swa_Latn': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'heb_Hebr-swe_Latn': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'heb_Hebr-tam_Taml': {'num_samples': 1997, 
'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'heb_Hebr-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'heb_Hebr-tur_Latn': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'heb_Hebr-vie_Latn': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'heb_Hebr-zho_Hant': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'heb_Hebr-zul_Latn': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hin_Deva-arb_Arab': {'num_samples': 1997, 
'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hin_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hin_Deva-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hin_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'hin_Deva-ell_Grek': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hin_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hin_Deva-eus_Latn': {'num_samples': 
1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'hin_Deva-fas_Arab': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hin_Deva-fin_Latn': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hin_Deva-fra_Latn': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hin_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'hin_Deva-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hin_Deva-hun_Latn': 
{'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'hin_Deva-ind_Latn': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hin_Deva-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hin_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'hin_Deva-kor_Hang': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hin_Deva-lit_Latn': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hin_Deva-mar_Deva': 
{'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'hin_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-nld_Latn': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hin_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'hin_Deva-pol_Latn': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hin_Deva-por_Latn': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 
'hin_Deva-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hin_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'hin_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'hin_Deva-spa_Latn': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hin_Deva-swa_Latn': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hin_Deva-swe_Latn': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 
1996}, 'hin_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hin_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-tur_Latn': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hin_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hin_Deva-vie_Latn': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hin_Deva-zho_Hant': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 
'unique_sentence2': 1996}, 'hin_Deva-zul_Latn': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hmn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 165.64, 'max_sentence1_length': 643, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'hrv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'hrv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'hrv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 
474, 'unique_sentence2': 1997}, 'hrv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'hrv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hrv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hrv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'hrv_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 
'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hrv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'hrv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'hrv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'hun_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hun_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hun_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 
148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hun_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hun_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hun_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hun_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hun_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hun_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hun_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'hun_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hun_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hun_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hun_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'hun_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 
'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hun_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hun_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hun_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hun_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hun_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hun_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 
'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hun_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hun_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hun_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hun_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hun_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hun_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 
'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hye_Armn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hye_Armn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hye_Armn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'hye_Armn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ibo_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ibo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ibo_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ibo_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ibo_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ibo_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ibo_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'ibo_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 
'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ibo_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ibo_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ibo_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ibo_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ibo_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ibo_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 
8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ind_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ind_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ind_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ind_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ind_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ind_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 
9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ind_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ind_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ind_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ind_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ind_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ind_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 
5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ind_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ind_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ind_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ind_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ind_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ind_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 
'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ind_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ind_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ind_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ind_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ind_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ind_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 
'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ind_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ind_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ind_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ind_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ind_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'ind_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 
'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ind_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'ind_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ind_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ind_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ind_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'isl_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'isl_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'isl_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'isl_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'isl_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'isl_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'isl_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'isl_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'isl_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'isl_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ita_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ita_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ita_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ita_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ita_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ita_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ita_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'ita_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-arb_Arab': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 
5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'jpn_Jpan-ben_Beng': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'jpn_Jpan-deu_Latn': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'jpn_Jpan-ell_Grek': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'jpn_Jpan-eng_Latn': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'jpn_Jpan-fas_Arab': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'jpn_Jpan-fin_Latn': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 
'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'jpn_Jpan-fra_Latn': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'jpn_Jpan-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'jpn_Jpan-hin_Deva': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'jpn_Jpan-hun_Latn': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'jpn_Jpan-ind_Latn': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'jpn_Jpan-kor_Hang': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 
'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'jpn_Jpan-lit_Latn': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'jpn_Jpan-nld_Latn': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'jpn_Jpan-pol_Latn': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'jpn_Jpan-por_Latn': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'jpn_Jpan-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'jpn_Jpan-spa_Latn': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 1, 
'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-swa_Latn': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'jpn_Jpan-swe_Latn': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'jpn_Jpan-tam_Taml': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'jpn_Jpan-tur_Latn': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'jpn_Jpan-vie_Latn': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'jpn_Jpan-yue_Hant': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 
'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'jpn_Jpan-zho_Hans': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'jpn_Jpan-zho_Hant': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'jpn_Jpan-zul_Latn': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'kan_Knda-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kan_Knda-div_Thaa': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'kan_Knda-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kan_Knda-eus_Latn': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'kan_Knda-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'kan_Knda-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kan_Knda-mar_Deva': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'kan_Knda-nep_Deva': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-pan_Guru': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 
7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'kan_Knda-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kan_Knda-snd_Arab': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'kan_Knda-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kan_Knda-tel_Telu': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-urd_Arab': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'kat_Geor-ell_Grek': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 
'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kat_Geor-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kat_Geor-hye_Armn': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kat_Geor-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'kaz_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kaz_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kaz_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 
1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kaz_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'kaz_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kaz_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kaz_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kaz_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kaz_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 
'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'khm_Khmr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'khm_Khmr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'khm_Khmr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'khm_Khmr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'khm_Khmr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'khm_Khmr-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 
517, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'khm_Khmr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'kin_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'kin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kin_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'kin_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'kin_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 
'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'kin_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'kin_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'kin_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'kir_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kir_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kir_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kir_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'kir_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kir_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kir_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kir_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kir_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'kmr_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kmr_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'kmr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kmr_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kmr_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kmr_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'kmr_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'kmr_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'kmr_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'kmr_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'kor_Hang-arb_Arab': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kor_Hang-ben_Beng': {'num_samples': 1997, 'number_of_characters': 377207, 
'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kor_Hang-deu_Latn': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'kor_Hang-ell_Grek': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kor_Hang-eng_Latn': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kor_Hang-fas_Arab': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kor_Hang-fin_Latn': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'kor_Hang-fra_Latn': {'num_samples': 1997, 'number_of_characters': 426513, 
'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'kor_Hang-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kor_Hang-hin_Deva': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kor_Hang-hun_Latn': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'kor_Hang-ind_Latn': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'kor_Hang-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'kor_Hang-lit_Latn': {'num_samples': 1997, 'number_of_characters': 392802, 
'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'kor_Hang-nld_Latn': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kor_Hang-pol_Latn': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'kor_Hang-por_Latn': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'kor_Hang-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'kor_Hang-spa_Latn': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'kor_Hang-swa_Latn': {'num_samples': 1997, 'number_of_characters': 405914, 
'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'kor_Hang-swe_Latn': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'kor_Hang-tam_Taml': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kor_Hang-tur_Latn': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kor_Hang-vie_Latn': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'kor_Hang-yue_Hant': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'kor_Hang-zho_Hans': {'num_samples': 1997, 'number_of_characters': 218366, 
'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'kor_Hang-zho_Hant': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'kor_Hang-zul_Latn': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'lao_Laoo-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'lao_Laoo-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'lao_Laoo-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lao_Laoo-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 555471, 
'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'lao_Laoo-mon_Mong': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'lao_Laoo-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'lao_Laoo-tha_Thai': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'lav_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lav_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lav_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 
546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lav_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'lit_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'lit_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'lit_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'lit_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'lit_Latn-eng_Latn': {'num_samples': 1997, 
'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lit_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'lit_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lit_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'lit_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'lit_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'lit_Latn-hun_Latn': {'num_samples': 
1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lit_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'lit_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'lit_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'lit_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'lit_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'lit_Latn-pol_Latn': {'num_samples': 
1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'lit_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'lit_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'lit_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'lit_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'lit_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'lit_Latn-tam_Taml': 
{'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'lit_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'lit_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'lit_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'lit_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ltz_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 
'ltz_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'ltz_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ltz_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ltz_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'ltz_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'ltz_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 
1996}, 'ltz_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'ltz_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'ltz_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'mal_Mlym-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mal_Mlym-fij_Latn': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mal_Mlym-fil_Latn': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 
'unique_sentence2': 1997}, 'mal_Mlym-ind_Latn': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mal_Mlym-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mal_Mlym-mri_Latn': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mal_Mlym-msa_Latn': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mal_Mlym-smo_Latn': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mal_Mlym-tah_Latn': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 
'unique_sentence2': 1997}, 'mal_Mlym-ton_Latn': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mar_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'mar_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'mar_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mar_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'mar_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 
378, 'unique_sentence2': 1997}, 'mar_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'mar_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'mar_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'mar_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'mar_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 
'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'mar_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'mar_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'mey_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'mey_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'mey_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mey_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'mey_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'mey_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'mey_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'mey_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'mey_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 
'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'mey_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'mkd_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'mkd_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'mkd_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'mkd_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'mkd_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mkd_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'mkd_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mkd_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'mkd_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'mkd_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'mkd_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 
1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'mkd_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'mkd_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'mlg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlg_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mlg_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mlg_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 
'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mlg_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mlg_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mlg_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mlg_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mlg_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mlg_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 
559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mlt_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'mlt_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlt_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'mlt_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mlt_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'mlt_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 
'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'mlt_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'mlt_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'mon_Mong-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mon_Mong-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mon_Mong-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mon_Mong-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 
129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mon_Mong-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mon_Mong-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'mon_Mong-tha_Thai': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'mri_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mri_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mri_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 9, 
'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mri_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mri_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mri_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mri_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mri_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mri_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 
9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mri_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'msa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'msa_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'msa_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'msa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'msa_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 
'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'msa_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'msa_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'msa_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'msa_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'msa_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mya_Mymr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 
'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mya_Mymr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mya_Mymr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mya_Mymr-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mya_Mymr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mya_Mymr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'mya_Mymr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 
1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'nde_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nde_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nde_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nde_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nde_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nde_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 
'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'nde_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nde_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'nep_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nep_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'nep_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nep_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 
523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'nep_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'nep_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nep_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'nep_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'nep_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'nep_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 
506964, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'nep_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'nep_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nep_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'nep_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'nld_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-arb_Arab': {'num_samples': 1997, 
'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'nld_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nld_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nld_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nld_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'nld_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nld_Latn-fao_Latn': {'num_samples': 
1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nld_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'nld_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'nld_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'nld_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'nld_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nld_Latn-hun_Latn': 
{'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'nld_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'nld_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nld_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'nld_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'nld_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'nld_Latn-ltz_Latn': 
{'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nld_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nld_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nld_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'nld_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'nld_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 
'nld_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'nld_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nld_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nld_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nld_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'nld_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 
1996}, 'nld_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'nld_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nno_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nno_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nno_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nno_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 
'unique_sentence2': 1997}, 'nno_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nno_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nno_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nno_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nno_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nno_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 
430, 'unique_sentence2': 1996}, 'nob_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nob_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nob_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nob_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nob_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nob_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 
'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nob_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nob_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nob_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nob_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nso_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'nso_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nso_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'nso_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'nso_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'nso_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'nso_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'nso_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 
'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nso_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'nso_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'nso_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'nso_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'nso_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'nso_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 
8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nya_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nya_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nya_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nya_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nya_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nya_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 
'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'nya_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nya_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'orm_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'orm_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'orm_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'orm_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 
1984, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'orm_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'orm_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'orm_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'orm_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'orm_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'orm_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 
'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'orm_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'orm_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'orm_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'orm_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'pan_Guru-ben_Beng': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pan_Guru-div_Thaa': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 
383, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'pan_Guru-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pan_Guru-eus_Latn': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'pan_Guru-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'pan_Guru-hin_Deva': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pan_Guru-kan_Knda': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'pan_Guru-mar_Deva': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 
383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'pan_Guru-nep_Deva': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'pan_Guru-snd_Arab': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'pan_Guru-tam_Taml': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pan_Guru-tel_Telu': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-urd_Arab': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 
383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'pol_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pol_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'pol_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pol_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'pol_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'pol_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 
'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'pol_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'pol_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'pol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pol_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pol_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 
139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'pol_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pol_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pol_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'pol_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'pol_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'pol_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'pol_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'pol_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'pol_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'pol_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'pol_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'pol_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'pol_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'pol_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'pol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 
'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'pol_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'pol_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pol_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'pol_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'pol_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'pol_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 
1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'pol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'por_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'por_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'por_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'por_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'por_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576342, 
'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'por_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'por_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'por_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'por_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'por_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 
477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'por_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'por_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'por_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'por_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'por_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'por_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 
410620, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'por_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'por_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'por_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'por_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'por_Latn-rus_Cyrl': {'num_samples': 1997, 
'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'por_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'por_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'por_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'por_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'por_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'por_Latn-vie_Latn': {'num_samples': 
1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'por_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'por_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'prs_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'prs_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'prs_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'prs_Arab-fas_Arab': 
{'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'prs_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'prs_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'prs_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'prs_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'prs_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 
'prs_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'pus_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pus_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'pus_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pus_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pus_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 
'pus_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'pus_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'pus_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'pus_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'pus_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'ron_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 
'ron_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ron_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ron_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ron_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'ron_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ron_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 
1996}, 'ron_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'rus_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'rus_Cyrl-ben_Beng': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'rus_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'rus_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 
'unique_sentence2': 1996}, 'rus_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'rus_Cyrl-deu_Latn': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'rus_Cyrl-ell_Grek': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'rus_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'rus_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'rus_Cyrl-fin_Latn': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 
463, 'unique_sentence2': 1996}, 'rus_Cyrl-fra_Latn': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'rus_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'rus_Cyrl-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'rus_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-hun_Latn': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 
'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'rus_Cyrl-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'rus_Cyrl-kor_Hang': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'rus_Cyrl-lit_Latn': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'rus_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'rus_Cyrl-nld_Latn': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'rus_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 
'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'rus_Cyrl-por_Latn': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'rus_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'rus_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-spa_Latn': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'rus_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 
128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'rus_Cyrl-swa_Latn': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'rus_Cyrl-swe_Latn': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'rus_Cyrl-tam_Taml': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'rus_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'rus_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'rus_Cyrl-vie_Latn': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'rus_Cyrl-zho_Hant': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'rus_Cyrl-zul_Latn': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'shi_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'shi_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'shi_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'shi_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 
9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'shi_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'shi_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'shi_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'shi_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'shi_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'shi_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 
'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'sin_Sinh-ben_Beng': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'sin_Sinh-div_Thaa': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'sin_Sinh-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sin_Sinh-eus_Latn': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'sin_Sinh-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'sin_Sinh-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 
1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'sin_Sinh-kan_Knda': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'sin_Sinh-mar_Deva': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'sin_Sinh-nep_Deva': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-pan_Guru': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'sin_Sinh-snd_Arab': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'sin_Sinh-tam_Taml': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 
'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'sin_Sinh-tel_Telu': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-urd_Arab': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'slk_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slk_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slk_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slk_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 
'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slk_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slk_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slk_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slk_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slk_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 
126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'slk_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slk_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slk_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'slv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 
'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slv_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 
'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'slv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'smo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'smo_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 
1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'smo_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'smo_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'smo_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'smo_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'smo_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'smo_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 
'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'smo_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'smo_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'sna_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'sna_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sna_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'sna_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 
526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'sna_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'sna_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'sna_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'sna_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'snd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'snd_Arab-div_Thaa': {'num_samples': 1997, 
'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'snd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'snd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'snd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'snd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'snd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'snd_Arab-mar_Deva': {'num_samples': 
1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'snd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'snd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'snd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'snd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-urd_Arab': 
{'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'som_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'som_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'som_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'som_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'som_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 
'som_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'som_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'som_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'som_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'som_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'som_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 
1990}, 'som_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'som_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'som_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'spa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'spa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'spa_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 
'unique_sentence2': 1997}, 'spa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'spa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'spa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'spa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'spa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'spa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 
512, 'unique_sentence2': 1996}, 'spa_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'spa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'spa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'spa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'spa_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 
623, 'unique_sentence2': 1996}, 'spa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'spa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'spa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'spa_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'spa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'spa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 
468, 'unique_sentence2': 1996}, 'spa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'spa_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'spa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'spa_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'spa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'spa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 
'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'spa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'spa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'spa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'spa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'sqi_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'sqi_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sqi_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'sqi_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'srp_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 
'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 
'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'srp_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'srp_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'srp_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ssw_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ssw_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 
510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ssw_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ssw_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'ssw_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ssw_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ssw_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ssw_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 
'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ssw_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ssw_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ssw_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ssw_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ssw_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ssw_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 
144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swa_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'swa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 10, 
'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swa_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'swa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 
'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swa_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'swa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 405914, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 
1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swa_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'swa_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'swa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 547402, 
'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swa_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'swa_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swa_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'swa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'swa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swa_Latn-tir_Ethi': {'num_samples': 1997, 
'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'swa_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'swa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swa_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'swa_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'swa_Latn-yor_Latn': 
{'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'swa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swe_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swe_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 
'swe_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'swe_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swe_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swe_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'swe_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 
'swe_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swe_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swe_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swe_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swe_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swe_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 
'swe_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'swe_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swe_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swe_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swe_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'swe_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 
'swe_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'swe_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'swe_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swe_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swe_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swe_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 
'swe_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'swe_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swe_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swe_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swe_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 
'tah_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tah_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'tah_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'tah_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tah_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'tah_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 
1994}, 'tah_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'tah_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'tah_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'tah_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'tam_Taml-arb_Arab': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tam_Taml-ben_Beng': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 
'unique_sentence2': 1997}, 'tam_Taml-deu_Latn': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tam_Taml-div_Thaa': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tam_Taml-ell_Grek': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tam_Taml-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tam_Taml-eus_Latn': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tam_Taml-fas_Arab': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 
'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tam_Taml-fin_Latn': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tam_Taml-fra_Latn': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tam_Taml-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tam_Taml-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tam_Taml-hin_Deva': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tam_Taml-hun_Latn': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tam_Taml-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tam_Taml-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tam_Taml-kan_Knda': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tam_Taml-kor_Hang': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tam_Taml-lit_Latn': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tam_Taml-mar_Deva': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 
'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tam_Taml-nep_Deva': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-nld_Latn': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tam_Taml-pan_Guru': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tam_Taml-pol_Latn': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tam_Taml-por_Latn': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tam_Taml-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tam_Taml-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tam_Taml-snd_Arab': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tam_Taml-spa_Latn': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tam_Taml-swa_Latn': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tam_Taml-swe_Latn': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tam_Taml-tel_Telu': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 
'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-tur_Latn': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tam_Taml-urd_Arab': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tam_Taml-vie_Latn': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tam_Taml-zho_Hant': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tam_Taml-zul_Latn': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tat_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 
'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tat_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tat_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tat_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tat_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tat_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tat_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 
123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tat_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tat_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tel_Telu-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tel_Telu-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tel_Telu-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tel_Telu-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 10, 
'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tel_Telu-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tel_Telu-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tel_Telu-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tel_Telu-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tel_Telu-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tel_Telu-pan_Guru': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 
'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tel_Telu-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tel_Telu-snd_Arab': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tel_Telu-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tel_Telu-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tgk_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tgk_Cyrl-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 526514, 
'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'tgk_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tgk_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tgk_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tgk_Cyrl-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'tgk_Cyrl-mey_Arab': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'tgk_Cyrl-prs_Arab': {'num_samples': 1997, 'number_of_characters': 
516683, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'tgk_Cyrl-pus_Arab': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'tgk_Cyrl-shi_Arab': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'tha_Thai-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'tha_Thai-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'tha_Thai-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tha_Thai-khm_Khmr': {'num_samples': 1997, 
'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'tha_Thai-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'tha_Thai-mon_Mong': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'tha_Thai-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'tir_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tir_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tir_Ethi-hau_Latn': {'num_samples': 
1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tir_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tir_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tir_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tir_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tir_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tir_Ethi-swa_Latn': {'num_samples': 
1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tir_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'tir_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tir_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tir_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tir_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ton_Latn-eng_Latn': {'num_samples': 
1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ton_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ton_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ton_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ton_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ton_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ton_Latn-mri_Latn': 
{'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ton_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ton_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ton_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'tsn_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tsn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 
'tsn_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tsn_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tsn_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tsn_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tsn_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tsn_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 
1996}, 'tsn_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tsn_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'tsn_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tsn_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tsn_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tsn_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 
'unique_sentence2': 1996}, 'tuk_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tuk_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tuk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tuk_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tuk_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tuk_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 
539, 'unique_sentence2': 1996}, 'tuk_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tuk_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tuk_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tur_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tur_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 
'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tur_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tur_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tur_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tur_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tur_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tur_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tur_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tur_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tur_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tur_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tur_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tur_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 
55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tur_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tur_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tur_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tur_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tur_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 
139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tur_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tur_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tur_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tur_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tur_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tur_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 
'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tur_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tur_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tur_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tur_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 
'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tur_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'uig_Arab-aze_Latn': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uig_Arab-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uig_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uig_Arab-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uig_Arab-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 
1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uig_Arab-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uig_Arab-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uig_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uig_Arab-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'ukr_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ukr_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 
'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ukr_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ukr_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'ukr_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ukr_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ukr_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ukr_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 
440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ukr_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ukr_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ukr_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ukr_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ukr_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'urd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 
'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'urd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'urd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'urd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'urd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'urd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'urd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 
124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'urd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'urd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'urd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'urd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'urd_Arab-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'urd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'urd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'uzb_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uzb_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uzb_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uzb_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uzb_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uzb_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uzb_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uzb_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uzb_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'ven_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ven_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 
1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ven_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'ven_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ven_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ven_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ven_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ven_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 
'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'vie_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'vie_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'vie_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'vie_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'vie_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'vie_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 
514460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'vie_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'vie_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'vie_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'vie_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'vie_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'vie_Latn-ind_Latn': {'num_samples': 1997, 
'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'vie_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'vie_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'vie_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'vie_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'vie_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'vie_Latn-por_Latn': {'num_samples': 1997, 
'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'vie_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'vie_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'vie_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'vie_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'vie_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'vie_Latn-tur_Latn': {'num_samples': 
1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'vie_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'vie_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'vie_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'vie_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'wol_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'wol_Latn-eng_Latn': {'num_samples': 
1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'wol_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'wol_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'wol_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'wol_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'wol_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'wol_Latn-ssw_Latn': 
{'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'wol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'wol_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'wol_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'wol_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'wol_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 
'wol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'xho_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'xho_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'xho_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'xho_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'xho_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 
1996}, 'xho_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'xho_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'xho_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'xho_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'xho_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'xho_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 
'unique_sentence2': 1997}, 'xho_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'xho_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'xho_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yor_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'yor_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yor_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 
483, 'unique_sentence2': 1997}, 'yor_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'yor_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'yor_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'yor_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'yor_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'yor_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 
'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'yor_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'yor_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'yor_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'yor_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'yor_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yue_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yue_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'yue_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'yue_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'yue_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'yue_Hant-zho_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hans-eng_Latn': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 
'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hans-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hans-kor_Hang': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hans-vie_Latn': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hans-yue_Hant': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hans-zho_Hant': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hant-arb_Arab': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 
'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zho_Hant-ben_Beng': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zho_Hant-deu_Latn': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zho_Hant-ell_Grek': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zho_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hant-fas_Arab': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zho_Hant-fin_Latn': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 
'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zho_Hant-fra_Latn': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zho_Hant-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zho_Hant-hin_Deva': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zho_Hant-hun_Latn': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zho_Hant-ind_Latn': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zho_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 
'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hant-lit_Latn': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zho_Hant-nld_Latn': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zho_Hant-pol_Latn': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zho_Hant-por_Latn': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zho_Hant-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 
'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zho_Hant-spa_Latn': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zho_Hant-swa_Latn': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zho_Hant-swe_Latn': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zho_Hant-tam_Taml': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zho_Hant-tur_Latn': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zho_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 
'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hant-yue_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'zho_Hant-zul_Latn': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'zul_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'zul_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zul_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 
'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zul_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zul_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zul_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zul_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zul_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zul_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zul_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'zul_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zul_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zul_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zul_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'zul_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 
'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zul_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zul_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zul_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zul_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zul_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'zul_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 
'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'zul_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zul_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zul_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zul_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'zul_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zul_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 
'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'zul_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zul_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zul_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zul_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'zul_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'zul_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zul_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zul_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'zul_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'zul_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'zul_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}}}} | | [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | 
[NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | +| [NamaaMrTydiReranking](https://huggingface.co/NAMAA-Space) (Muennighoff et al., 2022) | ['ara'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | | [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | None | None | | [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | @@ -672,7 +673,7 @@ The following tables give you an overview of the tasks in MTEB. | apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 1 | 9 | 2 | 0 | 0 | 28 | +| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 29 | | arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 8 | | are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1661,7 +1662,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 50 | 460 | 85 | 2 | 2 | +| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 460 | 85 | 2 | 2 | From 9c0b208eed58170485f89879a986e4ab411ff531 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 11 Dec 2024 23:26:46 +0000 Subject: [PATCH 05/31] 1.24.1 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 956ca50631..0ea64b9d14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.24.0" +version = "1.24.1" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 373db747d807c3f2597269ac9abf50291673764d Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Fri, 13 Dec 2024 15:42:01 +0500 Subject: [PATCH 06/31] fix: Eval langs not correctly passed to monolingual tasks (#1587) * fix SouthAfricanLangClassification.py * add check for langs * lint --- .../multilingual/HinDialectClassification.py | 46 +++++++++---------- .../SouthAfricanLangClassification.py | 26 +++++------ tests/test_TaskMetadata.py | 13 ++++++ 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/mteb/tasks/Classification/multilingual/HinDialectClassification.py b/mteb/tasks/Classification/multilingual/HinDialectClassification.py index 6565d4b71a..c9d6b36669 100644 --- a/mteb/tasks/Classification/multilingual/HinDialectClassification.py +++ b/mteb/tasks/Classification/multilingual/HinDialectClassification.py @@ -3,29 +3,29 @@ from mteb.abstasks.AbsTaskClassification import AbsTaskClassification from mteb.abstasks.TaskMetadata 
import TaskMetadata -_LANGUAGES = { - "pan": ["pan-Guru"], - "bgc": ["bgc-Deva"], - "mag": ["mag-Deva"], - "bns": ["bns-Deva"], - "kfq": ["kfg-Deva"], - "noe": ["noe-Deva"], - "bhb": ["bhb-Deva"], - "bho": ["bho-Deva"], - "gbm": ["gbm-Deva"], - "mup": ["mup-Deva"], - "anp": ["anp-Deva"], - "hne": ["hne-Deva"], - "bra": ["bra-Deva"], - "raj": ["raj-Deva"], - "awa": ["awa-Deva"], - "guj": ["guj-Gujr"], - "ben": ["ben-Beng"], - "bhd": ["bhd-Deva"], - "kfy": ["kfy-Deva"], - "mar": ["mar-Deva"], - "bjj": ["bjj-Deva"], -} +_LANGUAGES = [ + "pan-Guru", + "bgc-Deva", + "mag-Deva", + "bns-Deva", + "kfg-Deva", + "noe-Deva", + "bhb-Deva", + "bho-Deva", + "gbm-Deva", + "mup-Deva", + "anp-Deva", + "hne-Deva", + "bra-Deva", + "raj-Deva", + "awa-Deva", + "guj-Gujr", + "ben-Beng", + "bhd-Deva", + "kfy-Deva", + "mar-Deva", + "bjj-Deva", +] class HinDialectClassification(AbsTaskClassification): diff --git a/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py b/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py index 4cef2c0604..217d300ec0 100644 --- a/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py +++ b/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py @@ -3,19 +3,19 @@ from mteb.abstasks.AbsTaskClassification import AbsTaskClassification from mteb.abstasks.TaskMetadata import TaskMetadata -_LANGUAGES = { - "afr": ["afr-Latn"], - "eng": ["eng-Latn"], - "nbl": ["nbl-Latn"], - "nso": ["nso-Latn"], - "sot": ["sot-Latn"], - "ssw": ["ssw-Latn"], - "tsn": ["tsn-Latn"], - "tso": ["tso-Latn"], - "ven": ["ven-Latn"], - "xho": ["xho-Latn"], - "zul": ["zul-Latn"], -} +_LANGUAGES = [ + "afr-Latn", + "eng-Latn", + "nbl-Latn", + "nso-Latn", + "sot-Latn", + "ssw-Latn", + "tsn-Latn", + "tso-Latn", + "ven-Latn", + "xho-Latn", + "zul-Latn", +] class SouthAfricanLangClassification(AbsTaskClassification): diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 91ef4aabea..2b606c2c19 100644 --- 
a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -4,6 +4,7 @@ import pytest +from mteb import AbsTask from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.overview import get_tasks @@ -1095,3 +1096,15 @@ def test_empy_descriptive_stat_in_new_datasets(): assert ( task.metadata.name not in exceptions ), f"Dataset {task.metadata.name} should have descriptive stats" + + +@pytest.mark.parametrize("task", get_tasks()) +def test_eval_langs_correctly_specified(task: AbsTask): + if task.is_multilingual: + assert isinstance( + task.metadata.eval_langs, dict + ), f"{task.metadata.name} should have eval_langs as a dict" + else: + assert isinstance( + task.metadata.eval_langs, list + ), f"{task.metadata.name} should have eval_langs as a list" From eecc9f1192bdf7cb63bd7151359629cae0b87fd3 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 13 Dec 2024 10:55:51 +0000 Subject: [PATCH 07/31] 1.24.2 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0ea64b9d14..7e173d947f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.24.1" +version = "1.24.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From fdfdaeff8597707a70b79e1ff0b0cb5b63a97b01 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Sat, 14 Dec 2024 10:37:32 +0100 Subject: [PATCH 08/31] feat: Add ColBert (#1563) * feat: add max_sim operator for IR tasks to support multi-vector models * docs: add doc for Model2VecWrapper.__init__(...) 
* feat: add ColBERTWrapper to models & add ColBERTv2 * fix: resolve issues * fix: resolve issues * Update README.md Co-authored-by: Roman Solomatin * Update README.md Co-authored-by: Isaac Chung * Update README.md Co-authored-by: Isaac Chung * Update mteb/evaluation/evaluators/RetrievalEvaluator.py Co-authored-by: Isaac Chung * Update README.md Co-authored-by: Isaac Chung * README.md: rm subset * doc: update example for Late Interaction * get colbert running without errors * fix: pass is_query to pylate * fix: max_sim add pad_sequence * feat: integrate Jinja templates for ColBERTv2 and add model prompt handling * feat: add revision & prompt_name * doc: pad_sequence * rm TODO jina colbert v2 * doc: warning: higher resource usage for MaxSim --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung --- README.md | 22 ++ .../evaluators/RetrievalEvaluator.py | 8 +- mteb/evaluation/evaluators/utils.py | 28 +++ mteb/model_meta.py | 4 +- mteb/models/colbert_models.py | 191 ++++++++++++++++++ mteb/models/overview.py | 2 + pyproject.toml | 1 + 7 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 mteb/models/colbert_models.py diff --git a/README.md b/README.md index d105a7aeb8..ab711a5947 100644 --- a/README.md +++ b/README.md @@ -319,6 +319,28 @@ evaluation.run(
Saving retrieval task predictions +### Using Late Interaction models for retrieval + +```python +from mteb import MTEB +import mteb + + +colbert = mteb.get_model("colbert-ir/colbertv2.0") +tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) + +eval_splits = ["test"] + +evaluation = MTEB(tasks=tasks) + +evaluation.run( + colbert, + eval_splits=eval_splits, + corpus_chunk_size=500, +) +``` +This implementation uses the MaxSim operation to calculate the similarity between the sentences. Because of this, the number of embeddings processed is higher, which might lead to higher resource usage. Therefore, you can lower the `corpus_chunk_size` to reduce the load on resources. + ### Saving retrieval task predictions To save the predictions from a retrieval task, add the `--save_predictions` flag in the CLI or set `save_predictions=True` in the run method. The filename will be in the "{task_name}_{subset}_predictions.json" format. diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 8ec28c14ef..42dbb06b0f 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -25,6 +25,7 @@ dot_score, download, hole, + max_sim, mrr, nAUC, recall_cap, @@ -77,10 +78,15 @@ def __init__( if "convert_to_tensor" not in encode_kwargs: encode_kwargs["convert_to_tensor"] = True - self.score_functions = {"cos_sim": cos_sim, "dot": dot_score} + self.score_functions = { + "cos_sim": cos_sim, + "dot": dot_score, + "max_sim": max_sim, + } self.score_function_desc = { "cos_sim": "Cosine Similarity", "dot": "Dot Product", + "max_sim": "Max Similarity", } self.corpus_chunk_size = corpus_chunk_size if isinstance(previous_results, Path): diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 95d84bd2f2..787447198d 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -53,6 +53,34 @@ def 
dot_score(a: torch.Tensor, b: torch.Tensor): return torch.mm(a, b.transpose(0, 1)) +def max_sim(a: list, b: list): + """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. + Works with a Tensor of the shape (batch_size, num_tokens, token_dim) + + Return: + Matrix with res[i][j] = max_sim(a[i], b[j]) + """ # noqa: D402 + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float32) + + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float32) + + if len(a.shape) == 2: + a = a.unsqueeze(0) + + if len(b.shape) == 2: + b = b.unsqueeze(0) + + scores = torch.einsum( + "ash,bth->abst", + a, + b, + ) + + return scores.max(axis=-1).values.sum(axis=-1) + + # From https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L4 def mrr( qrels: dict[str, dict[str, int]], diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 2cfc6df297..3993b0bb96 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -26,8 +26,10 @@ "API", "Tevatron", "NumPy", + "PyLate", + "ColBERT", ] -DISTANCE_METRICS = Literal["cosine"] +DISTANCE_METRICS = Literal["cosine", "max_sim"] def sentence_transformers_loader( diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py new file mode 100644 index 0000000000..3b4b2edb51 --- /dev/null +++ b/mteb/models/colbert_models.py @@ -0,0 +1,191 @@ +from __future__ import annotations + +import logging +from collections.abc import Sequence +from functools import partial +from typing import Any + +import numpy as np +import torch + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta + +from .wrapper import Wrapper + +logger = logging.getLogger(__name__) + + +class ColBERTWrapper(Wrapper): + def __init__( + self, + model_name: str, + revision: str | None = None, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + """Wrapper for ColBERT models. 
+ + Args: + model_name: The ColBERT model to load from HuggingFace Hub. + revision: The revision of the model to use. + model_prompts: A dictionary mapping task names to prompt names. + First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt, + then to the composed prompt of task type + prompt type, then to the specific task type prompt, + and finally to the specific prompt type. + **kwargs: Additional arguments to pass to the model. + """ + try: + from pylate import models as colbert_model + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + "To use the ColBERT models `pylate` is required. Please install it with `pip install mteb[pylate]`." + ) from e + + self.model_name = model_name + self.model = colbert_model.ColBERT(self.model_name, revision=revision, **kwargs) + if ( + model_prompts is None + and hasattr(self.model, "prompts") + and len(self.model.prompts) > 0 + ): + try: + model_prompts = self.validate_task_to_prompt_name(self.model.prompts) + except ValueError: + model_prompts = None + elif model_prompts is not None and hasattr(self.model, "prompts"): + logger.info(f"Model prompts will be overwritten with {model_prompts}") + self.model.prompts = model_prompts + self.model_prompts = self.validate_task_to_prompt_name(model_prompts) + + def encode( + self, + sentences: Sequence[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + """Encodes the given sentences using the encoder. + + Args: + sentences: The sentences to encode. + task_name: The name of the task. Pylate uses this to + determine which prompt to use from a specified dictionary. + prompt_type: The name type of prompt. (query or passage) + **kwargs: Additional arguments to pass to the encoder. + + The order of priorities for prompt selection are: + 1. Composed prompt of task name + prompt type (query or passage) + 2. Specific task prompt + 3. 
Composed prompt of task type + prompt type (query or passage) + 4. Specific task type prompt + 5. Specific prompt type (query or passage) + + Returns: + The encoded sentences as a numpy array. + """ + prompt_name = None + if self.model_prompts is not None: + prompt_name = self.get_prompt_name( + self.model_prompts, task_name, prompt_type + ) + if prompt_name: + logger.info( + f"Using prompt_name={prompt_name} for task={task_name} prompt_type={prompt_type}" + ) + else: + logger.info( + f"No model prompts found for task={task_name} prompt_type={prompt_type}" + ) + logger.info(f"Encoding {len(sentences)} sentences.") + + pred = self.model.encode( + sentences, + prompt_name=prompt_name, + is_query=True if prompt_type == PromptType.query else False, + **kwargs, + ) + + # encode returns a list of tensors shaped (x, token_dim) where x is the number of tokens in the sentence + # we need to pad these tensors to the same length + # Tensors have varying lengths; therefore, they need to be padded with zeros to ensure uniformity before being combined + # output shape will be (batch_size, len(max(tokens)), embedding_token_dim) + pred = torch.nn.utils.rnn.pad_sequence(pred, batch_first=True, padding_value=0) + + return pred.cpu().numpy() + + +colbert_v2 = ModelMeta( + loader=partial( + ColBERTWrapper, + model_name="colbert-ir/colbertv2.0", + ), + name="colbert-ir/colbertv2.0", + languages=["eng_Latn"], + open_weights=True, + revision="c1e84128e85ef755c096a95bdb06b47793b13acf", + public_training_code=True, + release_date="2024-09-21", + n_parameters=110 * 1e6, + max_tokens=180, # Reduced for Benchmarking - see ColBERT paper + embed_dim=None, # Bag of Embeddings (128) for each token + license="mit", + similarity_fn_name="max_sim", + framework=["PyLate", "ColBERT"], + reference="https://huggingface.co/colbert-ir/colbertv2.0", + use_instructions=False, + adapted_from=None, + superseded_by=None, +) + + +jina_colbert_v2 = ModelMeta( + loader=partial( + ColBERTWrapper, + 
model_name="jinaai/jina-colbert-v2", + query_prefix="[QueryMarker]", + document_prefix="[DocumentMarker]", + attend_to_expansion_tokens=True, + trust_remote_code=True, + ), + name="jinaai/jina-colbert-v2", + languages=[ # list of languages the model has been evaluated on + "ara-Arab", # Arabic + "ben-Beng", # Bengali + "deu-Latn", # German + "spa-Latn", # Spanish + "eng-Latn", # English + "fas-Arab", # Persian + "fin-Latn", # Finnish + "fra-Latn", # French + "hin-Deva", # Hindi + "ind-Latn", # Indonesian + "jpn-Jpan", # Japanese + "kor-Kore", # Korean + "rus-Cyrl", # Russian + "swa-Latn", # Swahili + "tel-Telu", # Telugu + "tha-Thai", # Thai + "yor-Latn", # Yoruba + "zho-Hans", # Chinese (Simplified) + "nld-Latn", # Dutch + "ita-Latn", # Italian + "por-Latn", # Portuguese + "vie-Latn", # Vietnamese + ], + open_weights=True, + revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", + public_training_code=False, + release_date="2024-08-16", + n_parameters=559 * 1e6, + max_tokens=8192, + embed_dim=None, # Bag of Embeddings (128) for each token + license="cc-by-nc-4.0", + similarity_fn_name="max_sim", + framework=["PyLate", "ColBERT"], + reference="https://huggingface.co/jinaai/jina-colbert-v2", + use_instructions=False, + adapted_from=None, + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index f1b90f6c05..acf2b06f75 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -15,6 +15,7 @@ bge_models, bm25, cohere_models, + colbert_models, e5_instruct, e5_models, google_models, @@ -46,6 +47,7 @@ bge_models, bm25, cohere_models, + colbert_models, e5_instruct, e5_models, google_models, diff --git a/pyproject.toml b/pyproject.toml index 7e173d947f..e76a012b51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] model2vec = ["model2vec>=0.3.0"] +pylate = ["pylate>=1.1.4"] [tool.coverage.report] From 
b466051b952ade5bb60e0f25321fed0ebb73cb50 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 14 Dec 2024 09:55:55 +0000 Subject: [PATCH 09/31] 1.25.0 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e76a012b51..ca8e8db850 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.24.2" +version = "1.25.0" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 992b20b7079bba96fee913181e28292757c0087b Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Sun, 15 Dec 2024 01:11:11 +0100 Subject: [PATCH 10/31] doc: colbert add score_function & doc section (#1592) * doc: colbert add score_function & doc section * doc: Update README.md Co-authored-by: Kenneth Enevoldsen * doc: Update README.md Co-authored-by: Isaac Chung --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> Co-authored-by: Kenneth Enevoldsen Co-authored-by: Isaac Chung --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ab711a5947..a477d7b420 100644 --- a/README.md +++ b/README.md @@ -317,7 +317,7 @@ evaluation.run(
- Saving retrieval task predictions + Late Interaction (ColBERT) ### Using Late Interaction models for retrieval @@ -336,10 +336,17 @@ evaluation = MTEB(tasks=tasks) evaluation.run( colbert, eval_splits=eval_splits, + score_function="max_sim", corpus_chunk_size=500, ) ``` -This implementation uses the MaxSim operation to calculate the similarity between the sentences. Because of this, the number of embeddings processed is higher, which might lead to higher resource usage. Therefore, you can lower the `corpus_chunk_size` to reduce the load on resources. +This implementation employs the MaxSim operation to compute the similarity between sentences. While MaxSim provides high-quality results, it processes a larger number of embeddings, potentially leading to increased resource usage. To manage resource consumption, consider lowering the `corpus_chunk_size` parameter. + + +
+ +
+ Saving retrieval task predictions ### Saving retrieval task predictions From 8e6ee46408ca359833033b81aeec2be132cbfa0d Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 16 Dec 2024 04:32:02 +0500 Subject: [PATCH 11/31] Feat: add support for scoring function (#1594) * add support for scoring function * lint * move similarity to wrapper * remove score function * lint * remove from InstructionRetrievalEvaluator * Update mteb/evaluation/evaluators/RetrievalEvaluator.py Co-authored-by: Kenneth Enevoldsen * remove score function from README.md --------- Co-authored-by: Kenneth Enevoldsen --- README.md | 1 - .../InstructionRetrievalEvaluator.py | 2 - .../evaluators/RetrievalEvaluator.py | 60 +++++++------------ mteb/evaluation/evaluators/utils.py | 28 --------- mteb/model_meta.py | 2 +- mteb/models/colbert_models.py | 27 +++++++++ mteb/models/sentence_transformer_wrapper.py | 3 + 7 files changed, 51 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index a477d7b420..ca57669da4 100644 --- a/README.md +++ b/README.md @@ -336,7 +336,6 @@ evaluation = MTEB(tasks=tasks) evaluation.run( colbert, eval_splits=eval_splits, - score_function="max_sim", corpus_chunk_size=500, ) ``` diff --git a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py b/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py index f17dad9872..154717d9f1 100644 --- a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py @@ -34,7 +34,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, task_name=self.task_name, # type: ignore instructions=instructions, **kwargs, @@ -44,7 +43,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, instructions=instructions, request_qid=qid, task_name=self.task_name, diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 42dbb06b0f..70f26e2236 100644 --- 
a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -22,10 +22,8 @@ confidence_scores, convert_conv_history_to_query, cos_sim, - dot_score, download, hole, - max_sim, mrr, nAUC, recall_cap, @@ -78,16 +76,6 @@ def __init__( if "convert_to_tensor" not in encode_kwargs: encode_kwargs["convert_to_tensor"] = True - self.score_functions = { - "cos_sim": cos_sim, - "dot": dot_score, - "max_sim": max_sim, - } - self.score_function_desc = { - "cos_sim": "Cosine Similarity", - "dot": "Dot Product", - "max_sim": "Max Similarity", - } self.corpus_chunk_size = corpus_chunk_size if isinstance(previous_results, Path): self.previous_results = str(previous_results) @@ -112,21 +100,12 @@ def search( corpus: dict[str, dict[str, str]], queries: dict[str, str | list[str]], top_k: int, - score_function: str, task_name: str, instructions: dict[str, str] | None = None, request_qid: str | None = None, return_sorted: bool = False, **kwargs, ) -> dict[str, dict[str, float]]: - # Create embeddings for all queries using model.encode - # Runs semantic search against the corpus embeddings - # Returns a ranked list with the corpus ids - if score_function not in self.score_functions: - raise ValueError( - f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" - ) - logger.info("Encoding Queries.") query_ids = list(queries.keys()) self.results = {qid: {} for qid in query_ids} @@ -156,9 +135,6 @@ def search( corpus = [corpus[cid] for cid in corpus_ids] # type: ignore logger.info("Encoding Corpus in batches... 
Warning: This might take a while!") - logger.info( - f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})" - ) itr = range(0, len(corpus), self.corpus_chunk_size) @@ -190,35 +166,43 @@ def search( if self.save_corpus_embeddings and request_qid: self.corpus_embeddings[request_qid].append(sub_corpus_embeddings) - # Compute similarites using either cosine-similarity or dot product - cos_scores = self.score_functions[score_function]( - query_embeddings, sub_corpus_embeddings - ) - is_nan = torch.isnan(cos_scores) + # Compute similarites using self defined similarity otherwise default to cosine-similarity + similarity_scores = cos_sim(query_embeddings, sub_corpus_embeddings) + if hasattr(self.model, "similarity"): + similarity_scores = self.model.similarity( + float(self.model.similarity(e1, e2)) + for e1, e2 in zip(query_embeddings, sub_corpus_embeddings) + ) + is_nan = torch.isnan(similarity_scores) if is_nan.sum() > 0: logger.warning( f"Found {is_nan.sum()} NaN values in the similarity scores. Replacing NaN values with -1." 
) - cos_scores[is_nan] = -1 + similarity_scores[is_nan] = -1 # Get top-k values - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - cos_scores, + similarity_scores_top_k_values, similarity_scores_top_k_idx = torch.topk( + similarity_scores, min( top_k + 1, - len(cos_scores[1]) if len(cos_scores) > 1 else len(cos_scores[-1]), + len(similarity_scores[1]) + if len(similarity_scores) > 1 + else len(similarity_scores[-1]), ), dim=1, largest=True, sorted=return_sorted, ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() + similarity_scores_top_k_values = ( + similarity_scores_top_k_values.cpu().tolist() + ) + similarity_scores_top_k_idx = similarity_scores_top_k_idx.cpu().tolist() for query_itr in range(len(query_embeddings)): query_id = query_ids[query_itr] for sub_corpus_id, score in zip( - cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] + similarity_scores_top_k_idx[query_itr], + similarity_scores_top_k_values[query_itr], ): corpus_id = corpus_ids[corpus_start_idx + sub_corpus_id] if len(result_heaps[query_id]) < top_k: @@ -447,7 +431,6 @@ def __init__( retriever, task_name: str | None = None, k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], - score_function: str = "cos_sim", encode_kwargs: dict[str, Any] = {}, **kwargs, ): @@ -469,7 +452,6 @@ def __init__( self.top_k = ( max(k_values) if "top_k" not in kwargs else kwargs["top_k"] ) # can lower it if reranking - self.score_function = score_function self.task_name = task_name def __call__( @@ -490,7 +472,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, task_name=self.task_name, # type: ignore ) else: @@ -498,7 +479,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, task_name=self.task_name, # type: ignore ) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 787447198d..95d84bd2f2 100644 --- 
a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -53,34 +53,6 @@ def dot_score(a: torch.Tensor, b: torch.Tensor): return torch.mm(a, b.transpose(0, 1)) -def max_sim(a: list, b: list): - """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. - Works with a Tensor of the shape (batch_size, num_tokens, token_dim) - - Return: - Matrix with res[i][j] = max_sim(a[i], b[j]) - """ # noqa: D402 - if not isinstance(a, torch.Tensor): - a = torch.tensor(a, dtype=torch.float32) - - if not isinstance(b, torch.Tensor): - b = torch.tensor(b, dtype=torch.float32) - - if len(a.shape) == 2: - a = a.unsqueeze(0) - - if len(b.shape) == 2: - b = b.unsqueeze(0) - - scores = torch.einsum( - "ash,bth->abst", - a, - b, - ) - - return scores.max(axis=-1).values.sum(axis=-1) - - # From https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L4 def mrr( qrels: dict[str, dict[str, int]], diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 3993b0bb96..fc0b473947 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -29,7 +29,7 @@ "PyLate", "ColBERT", ] -DISTANCE_METRICS = Literal["cosine", "max_sim"] +DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"] def sentence_transformers_loader( diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 3b4b2edb51..8753791bff 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -115,6 +115,33 @@ def encode( return pred.cpu().numpy() + def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. 
+ Works with a Tensor of the shape (batch_size, num_tokens, token_dim) + + Return: + Matrix with res[i][j] = max_sim(a[i], b[j]) + """ # noqa: D402 + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float32) + + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float32) + + if len(a.shape) == 2: + a = a.unsqueeze(0) + + if len(b.shape) == 2: + b = b.unsqueeze(0) + + scores = torch.einsum( + "ash,bth->abst", + a, + b, + ) + + return scores.max(axis=-1).values.sum(axis=-1) + colbert_v2 = ModelMeta( loader=partial( diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 13d39e4031..763fa7e154 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -56,6 +56,9 @@ def __init__( if isinstance(self.model, CrossEncoder): self.predict = self._predict + if hasattr(self.model, "similarity"): + self.similarity = self.model.similarity + def encode( self, sentences: Sequence[str], From 95d5ae5897c0e8671dd6ba0bd9f8a7db1fb930be Mon Sep 17 00:00:00 2001 From: Alexey Vatolin Date: Mon, 16 Dec 2024 11:12:10 +0100 Subject: [PATCH 12/31] Add new models nvidia, gte, linq (#1436) * Add new models nvidia, gte, linq * add warning for gte-Qwen and nvidia models re: instruction used in docs as well --------- Co-authored-by: isaac-chung --- mteb/models/bge_models.py | 6 +- mteb/models/cohere_models.py | 4 +- mteb/models/e5_instruct.py | 11 ++- mteb/models/e5_models.py | 14 ++-- mteb/models/gritlm_models.py | 4 +- mteb/models/gte_models.py | 66 ++++++++++++++- mteb/models/instruct_wrapper.py | 5 ++ mteb/models/jina_models.py | 2 +- mteb/models/linq_models.py | 40 +++++++++ mteb/models/mxbai_models.py | 2 +- mteb/models/nvidia_models.py | 115 ++++++++++++++++++++++++++ mteb/models/overview.py | 20 +++-- mteb/models/rerankers_custom.py | 6 +- mteb/models/rerankers_monot5_based.py | 28 +++---- mteb/models/ru_sentence_models.py | 4 +- 
mteb/models/salesforce_models.py | 43 ++++++++-- mteb/models/stella_models.py | 4 +- mteb/models/uae_models.py | 2 +- mteb/models/voyage_models.py | 14 ++-- 19 files changed, 319 insertions(+), 71 deletions(-) create mode 100644 mteb/models/linq_models.py create mode 100644 mteb/models/nvidia_models.py diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 5ab4294795..b643b4dfb6 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -7,7 +7,7 @@ model_prompts = {"query": "Represent this sentence for searching relevant passages: "} bge_small_en_v1_5 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="BAAI/bge-small-en-v1.5", revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", @@ -30,7 +30,7 @@ ) bge_base_en_v1_5 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="BAAI/bge-base-en-v1.5", revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", @@ -53,7 +53,7 @@ ) bge_large_en_v1_5 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="BAAI/bge-large-en-v1.5", revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09", diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 3f07a0d23b..2a8aa1e3d3 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -216,7 +216,7 @@ def encode( } cohere_mult_3 = ModelMeta( - loader=partial( + loader=partial( # type: ignore CohereTextEmbeddingModel, model_name="embed-multilingual-v3.0", model_prompts=model_prompts, @@ -238,7 +238,7 @@ def encode( ) cohere_eng_3 = ModelMeta( - loader=partial( + loader=partial( # type: ignore CohereTextEmbeddingModel, model_name="embed-english-v3.0", model_prompts=model_prompts, diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 5d5b1f3ad6..f26d78ed6d 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -12,15 +12,14 @@ 
MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] -def e5_instruction(instruction: str) -> str: - return f"Instruct: {instruction}\nQuery: " +E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " e5_instruct = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="intfloat/multilingual-e5-large-instruct", - instruction_template=e5_instruction, + instruction_template=E5_INSTRUCTION, attn="cccc", pooling_method="mean", mode="embedding", @@ -44,10 +43,10 @@ def e5_instruction(instruction: str) -> str: ) e5_mistral = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="intfloat/e5-mistral-7b-instruct", - instruction_template=e5_instruction, + instruction_template=E5_INSTRUCTION, attn="cccc", pooling_method="lasttoken", mode="embedding", diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 79b84b993f..4fee54de79 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -114,7 +114,7 @@ } e5_mult_small = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/multilingual-e5-small", revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", @@ -137,7 +137,7 @@ ) e5_mult_base = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/multilingual-e5-base", model_prompts=model_prompts, @@ -159,7 +159,7 @@ ) e5_mult_large = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/multilingual-e5-large", revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb", @@ -182,7 +182,7 @@ ) e5_eng_small_v2 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/e5-small-v2", model_prompts=model_prompts, @@ -204,7 +204,7 @@ ) e5_eng_small = ModelMeta( - loader=partial( + loader=partial( # type: ignore 
sentence_transformers_loader, model_name="intfloat/e5-small", revision="e272f3049e853b47cb5ca3952268c6662abda68f", @@ -227,7 +227,7 @@ ) e5_eng_base_v2 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/e5-base-v2", revision="1c644c92ad3ba1efdad3f1451a637716616a20e8", @@ -252,7 +252,7 @@ ) e5_eng_large_v2 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="intfloat/e5-large-v2", revision="b322e09026e4ea05f42beadf4d661fb4e101d311", diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index b1c4882bc2..1e9c85eb83 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -17,7 +17,7 @@ def gritlm_instruction(instruction: str = "") -> str: gritlm7b = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="GritLM/GritLM-7B", instruction_template=gritlm_instruction, @@ -40,7 +40,7 @@ def gritlm_instruction(instruction: str = "") -> str: use_instructions=True, ) gritlm8x7b = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="GritLM/GritLM-8x7B", instruction_template=gritlm_instruction, diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 2358ef6d5f..301821b6e0 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -3,14 +3,18 @@ from functools import partial from mteb.model_meta import ModelMeta +from mteb.models.instruct_wrapper import instruct_wrapper + + +def instruction_template(instruction: str) -> str: + return f"Instruct: {instruction}\nQuery: " if instruction else "" -from .instruct_wrapper import instruct_wrapper gte_Qwen2_7B_instruct = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="Alibaba-NLP/gte-Qwen2-7B-instruct", - instruction_template="Instruct: {instruction}\nQuery: ", + instruction_template=instruction_template, 
attn="cccc", pooling_method="lasttoken", mode="embedding", @@ -33,3 +37,59 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, ) + + +gte_Qwen1_5_7B_instruct = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="Alibaba-NLP/gte-Qwen1.5-7B-instruct", + instruction_template=instruction_template, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype="auto", + normalized=True, + ), + name="Alibaba-NLP/gte-Qwen1.5-7B-instruct", + languages=["eng_Latn"], + open_weights=True, + revision="07d27e5226328010336563bc1b564a5e3436a298", + release_date="2024-04-20", # initial commit of hf model. + n_parameters=7_720_000_000, + memory_usage=None, + embed_dim=4096, + license="apache-2.0", + max_tokens=32768, + reference="https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) + + +gte_Qwen2_1_5B_instruct = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct", + instruction_template=instruction_template, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype="auto", + normalized=True, + ), + name="Alibaba-NLP/gte-Qwen2-1.5B-instruct", + languages=["eng_Latn"], + open_weights=True, + revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd", + release_date="2024-07-29", # initial commit of hf model. 
+ n_parameters=1_780_000_000, + memory_usage=None, + embed_dim=8960, + license="apache-2.0", + max_tokens=131072, + reference="https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 30c173c779..d6b5cc0388 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -47,6 +47,11 @@ def __init__( "No instruction template provided. Instructions will be used as-is." ) + if "gte-Qwen" in model_name_or_path: + logger.warning( + "Instructions are used in both query and docs, which may cause performance discrepancies from the original implementation." + ) + self.instruction_template = instruction_template super().__init__(model_name_or_path=model_name_or_path, mode=mode, **kwargs) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 08eb6cb63d..7b2c343a1d 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -191,7 +191,7 @@ def encode( jina_embeddings_v3 = ModelMeta( - loader=partial( + loader=partial( # type: ignore JinaWrapper, model="jinaai/jina-embeddings-v3", revision="215a6e121fa0183376388ac6b1ae230326bfeaed", diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py new file mode 100644 index 0000000000..48e86ac8d5 --- /dev/null +++ b/mteb/models/linq_models.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from functools import partial + +import torch + +from mteb.model_meta import ModelMeta +from mteb.models.instruct_wrapper import instruct_wrapper + + +def instruction_template(instruction: str) -> str: + return f"Instruct: {instruction}\nQuery: " if instruction else "" + + +Linq_Embed_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="Linq-AI-Research/Linq-Embed-Mistral", + instruction_template=instruction_template, + attn="cccc", + 
pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.bfloat16, + normalized=True, + ), + name="Linq-AI-Research/Linq-Embed-Mistral", + languages=["eng_Latn"], + open_weights=True, + revision="0c1a0b0589177079acc552433cad51d7c9132379", + release_date="2024-05-29", # initial commit of hf model. + n_parameters=7_110_000_000, + memory_usage=None, + embed_dim=4096, + license="cc-by-nc-4.0", + max_tokens=32768, + reference="https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index ce7d1808bd..5dfb9dc42a 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -5,7 +5,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader mxbai_embed_large_v1 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="mixedbread-ai/mxbai-embed-large-v1", revision="990580e27d329c7408b3741ecff85876e128e203", diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py new file mode 100644 index 0000000000..0c0170de6e --- /dev/null +++ b/mteb/models/nvidia_models.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import logging +from collections.abc import Sequence +from functools import partial +from typing import Any + +import numpy as np +import torch +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper + +logger = logging.getLogger(__name__) + + +def instruction_template(instruction: str) -> str: + return f"Instruct: {instruction}\nQuery: " if instruction else "" + + +class NvEmbedWrapper(SentenceTransformerWrapper): + def __init__( + self, + model: str | SentenceTransformer | CrossEncoder, + revision: 
str | None = None, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + super().__init__(model, revision, model_prompts, **kwargs) + self.model.max_seq_length = 32768 + self.model.tokenizer.padding_side = "right" + logger.warning( + "Instructions are used in both query and docs, which may cause performance discrepancies from the original implementation." + ) + + def encode( + self, + sentences: Sequence[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + # Add eos token to each input example + sentences = [example + self.model.tokenizer.eos_token for example in sentences] + + instruction = "" + if prompt_type == PromptType.query: + instruction = self.get_instruction(task_name, prompt_type) + + prompt = instruction_template(instruction) + + if prompt: + logger.info(f"Using {prompt=} for task={task_name} {prompt_type=}") + else: + logger.info(f"No model prompts found for task={task_name} {prompt_type=}") + + logger.info(f"Encoding {len(sentences)} sentences.") + + embeddings = self.model.encode( + sentences, + prompt=prompt, + normalize_embeddings=True, + **kwargs, + ) + if isinstance(embeddings, torch.Tensor): + embeddings = embeddings.cpu().detach().float().numpy() + return embeddings + + +NV_embed_v2 = ModelMeta( + loader=partial( # type: ignore + NvEmbedWrapper, + model="nvidia/NV-Embed-v2", + trust_remote_code=True, + ), + name="nvidia/NV-Embed-v2", + languages=["eng_Latn"], + open_weights=True, + revision="7604d305b621f14095a1aa23d351674c2859553a", + release_date="2024-09-09", # initial commit of hf model. 
+ n_parameters=7_850_000_000, + memory_usage=None, + embed_dim=4096, + license="cc-by-nc-4.0", + max_tokens=32768, + reference="https://huggingface.co/nvidia/NV-Embed-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) + +NV_embed_v1 = ModelMeta( + loader=partial( # type: ignore + NvEmbedWrapper, + model="nvidia/NV-Embed-v1", + trust_remote_code=True, + ), + name="nvidia/NV-Embed-v1", + languages=["eng_Latn"], + open_weights=True, + revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c", + release_date="2024-09-13", # initial commit of hf model. + n_parameters=7_850_000_000, + memory_usage=None, + embed_dim=4096, + license="cc-by-nc-4.0", + max_tokens=32768, + reference="https://huggingface.co/nvidia/NV-Embed-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index acf2b06f75..f93c8862ff 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -22,11 +22,13 @@ gritlm_models, gte_models, jina_models, + linq_models, llm2vec_models, misc_models, model2vec_models, mxbai_models, nomic_models, + nvidia_models, openai_models, promptriever_models, repllama_models, @@ -51,26 +53,28 @@ e5_instruct, e5_models, google_models, + google_models, gritlm_models, gte_models, + jina_models, + linq_models, llm2vec_models, mxbai_models, model2vec_models, misc_models, nomic_models, + nvidia_models, openai_models, + promptriever_models, + repllama_models, + rerankers_custom, + rerankers_monot5_based, ru_sentence_models, salesforce_models, sentence_transformers_models, - voyage_models, - google_models, - repllama_models, - promptriever_models, - jina_models, - uae_models, stella_models, - rerankers_monot5_based, - rerankers_custom, + uae_models, + voyage_models, ] MODEL_REGISTRY = {} diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index dc354a550c..40977f1e04 
100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -175,7 +175,7 @@ def loader_inner(**kwargs: Any) -> Encoder: monobert_large = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoBERTReranker, model_name_or_path="castorini/monobert-large-msmarco", @@ -190,7 +190,7 @@ def loader_inner(**kwargs: Any) -> Encoder: # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 jina_reranker_multilingual = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=JinaReranker, model_name_or_path="jinaai/jina-reranker-v2-base-multilingual", @@ -204,7 +204,7 @@ def loader_inner(**kwargs: Any) -> Encoder: ) bge_reranker_v2_m3 = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=BGEReranker, model_name_or_path="BAAI/bge-reranker-v2-m3", diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index aef4a19e7e..b96897ee51 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -276,7 +276,7 @@ def get_prediction_tokens(self, *args, **kwargs): monot5_small = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, model_name_or_path="castorini/monot5-small-msmarco-10k", @@ -290,7 +290,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) monot5_base = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, model_name_or_path="castorini/monot5-base-msmarco-10k", @@ -304,7 +304,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) monot5_large = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, model_name_or_path="castorini/monot5-large-msmarco-10k", @@ -318,7 +318,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) monot5_3b = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, 
model_name_or_path="castorini/monot5-3b-msmarco-10k", @@ -332,7 +332,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_base = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=FLANT5Reranker, model_name_or_path="google/flan-t5-base", @@ -346,7 +346,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_large = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=FLANT5Reranker, model_name_or_path="google/flan-t5-large", @@ -360,7 +360,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_xl = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=FLANT5Reranker, model_name_or_path="google/flan-t5-xl", @@ -374,7 +374,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_xxl = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=FLANT5Reranker, model_name_or_path="google/flan-t5-xxl", @@ -389,7 +389,7 @@ def get_prediction_tokens(self, *args, **kwargs): llama2_7b = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=LlamaReranker, model_name_or_path="meta-llama/Llama-2-7b-hf", @@ -403,7 +403,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) llama2_7b_chat = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=LlamaReranker, model_name_or_path="meta-llama/Llama-2-7b-chat-hf", @@ -417,7 +417,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) mistral_7b = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MistralReranker, model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2", @@ -431,7 +431,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) followir_7b = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=FollowIRReranker, model_name_or_path="jhu-clsp/FollowIR-7B", @@ -550,7 +550,7 @@ def get_prediction_tokens(self, *args, **kwargs): ] mt5_base_mmarco_v2 = ModelMeta( - loader=partial( + 
loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, model_name_or_path="unicamp-dl/mt5-base-mmarco-v2", @@ -564,7 +564,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) mt5_13b_mmarco_100k = ModelMeta( - loader=partial( + loader=partial( # type: ignore _loader, wrapper=MonoT5Reranker, model_name_or_path="unicamp-dl/mt5-13b-mmarco-100k", diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index cfe8965164..301892e070 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -75,7 +75,7 @@ ) user_base_ru = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="deepvk/USER-base", revision="436a489a2087d61aa670b3496a9915f84e46c861", @@ -218,7 +218,7 @@ rosberta_ru_en = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="ai-forever/ru-en-RoSBERTa", revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index eabc4352a0..e5c0973d5f 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -2,26 +2,23 @@ from functools import partial -import torch - from mteb.model_meta import ModelMeta - -from .instruct_wrapper import instruct_wrapper +from mteb.models.instruct_wrapper import instruct_wrapper -def sfr_instruction(instruction: str) -> str: - return f"Instruct: {instruction}\nQuery: " +def instruction_template(instruction: str) -> str: + return f"Instruct: {instruction}\nQuery: " if instruction else "" SFR_Embedding_2_R = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="Salesforce/SFR-Embedding-2_R", - instruction_template=sfr_instruction, + instruction_template=instruction_template, attn="cccc", pooling_method="lasttoken", mode="embedding", - torch_dtype=torch.bfloat16, + torch_dtype="auto", # The ST script does not normalize while the 
HF one does so unclear what to do # https://huggingface.co/Salesforce/SFR-Embedding-2_R normalized=True, @@ -41,3 +38,31 @@ def sfr_instruction(instruction: str) -> str: framework=["Sentence Transformers", "PyTorch"], use_instructions=True, ) + + +SFR_Embedding_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="Salesforce/SFR-Embedding-Mistral", + instruction_template=instruction_template, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype="auto", + normalized=True, + ), + name="Salesforce/SFR-Embedding-Mistral", + languages=["eng_Latn"], + open_weights=True, + revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", + release_date="2024-01-24", # initial commit of hf model. + n_parameters=7_110_000_000, + memory_usage=None, + embed_dim=4096, + license="cc-by-nc-4.0", + max_tokens=32768, + reference="https://huggingface.co/Salesforce/SFR-Embedding-Mistral", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 8fc19fd06d..153ee6aa99 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -7,7 +7,7 @@ stella_en_400M = ModelMeta( # https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4 - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="dunzhang/stella_en_400M_v5", attn="cccc", @@ -31,7 +31,7 @@ ) stella_en_1_5b = ModelMeta( - loader=partial( + loader=partial( # type: ignore instruct_wrapper, model_name_or_path="dunzhang/stella_en_1.5B_v5", attn="cccc", diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index b18240b47c..942d508949 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -52,7 +52,7 @@ def encode( uae_large_v1 = ModelMeta( - loader=partial( + loader=partial( # type: ignore UAEWrapper, model="WhereIsAI/UAE-Large-V1", 
revision="369c368f70f16a613f19f5598d4f12d9f44235d4", diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 9f42808b37..57453b0762 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -145,7 +145,7 @@ def _batched_encode( revision="1", release_date="2024-05-05", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-large-2-instruct", model_prompts=model_prompts, @@ -167,7 +167,7 @@ def _batched_encode( revision="1", release_date="2024-05-30", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-finance-2", model_prompts=model_prompts, @@ -189,7 +189,7 @@ def _batched_encode( revision="1", release_date="2024-04-15", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-law-2", model_prompts=model_prompts, @@ -211,7 +211,7 @@ def _batched_encode( revision="1", release_date="2024-01-23", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-code-2", model_prompts=model_prompts, @@ -233,7 +233,7 @@ def _batched_encode( revision="1", release_date="2023-10-29", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-large-2", model_prompts=model_prompts, @@ -255,7 +255,7 @@ def _batched_encode( revision="1", release_date="2023-10-29", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, model_name="voyage-2", model_prompts=model_prompts, @@ -276,7 +276,7 @@ def _batched_encode( revision="1", release_date="2024-06-10", languages=None, # supported languages not specified - loader=partial( + loader=partial( # type: ignore VoyageWrapper, 
model_name="voyage-multilingual-2", model_prompts=model_prompts, From 0c9e046eb1cc81b4780ef5c02e9e04f9a68521c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 16 Dec 2024 16:52:18 +0100 Subject: [PATCH 13/31] Leaderboard: Refined plots (#1601) * Added embedding size guide to performance-size plot, removed shading on radar chart * Changed plot names to something more descriptive * Made plots failsafe --- mteb/leaderboard/app.py | 4 +- mteb/leaderboard/figures.py | 95 ++++++++++++++++++++++++++++++++++--- 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index a6dd1c7325..6838891e02 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -218,12 +218,12 @@ def update_task_info(task_names: str) -> gr.DataFrame: ) citation = gr.Markdown(update_citation, inputs=[benchmark_select]) with gr.Column(): - with gr.Tab("Performance-Size Plot"): + with gr.Tab("Performance per Model Size"): plot = gr.Plot(performance_size_plot, inputs=[summary_table]) gr.Markdown( "*We only display models that have been run on all tasks in the benchmark*" ) - with gr.Tab("Top 5 Radar Chart"): + with gr.Tab("Performance per Task Type (Radar Chart)"): radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) gr.Markdown( "*We only display models that have been run on all task types in the benchmark*" diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py index 9f3e73f7a4..35f91dd363 100644 --- a/mteb/leaderboard/figures.py +++ b/mteb/leaderboard/figures.py @@ -6,6 +6,28 @@ import plotly.graph_objects as go +def text_plot(text: str): + """Returns empty scatter plot with text added, this can be great for error messages.""" + return px.scatter(template="plotly_white").add_annotation( + text=text, showarrow=False, font=dict(size=20) + ) + + +def failsafe_plot(fun): + """Decorator that turns the function producing a figure failsafe. 
+ This is necessary, because once a Callback encounters an exception it + becomes useless in Gradio. + """ + + def wrapper(*args, **kwargs): + try: + return fun(*args, **kwargs) + except Exception: + return text_plot("Couldn't produce plot.") + + return wrapper + + def parse_n_params(text: str) -> int: if text.endswith("M"): return float(text[:-1]) * 1e6 @@ -37,6 +59,48 @@ def parse_float(value) -> float: ] +def add_size_guide(fig: go.Figure): + xpos = [5 * 1e9] * 4 + ypos = [7.8, 8.5, 9, 10] + sizes = [256, 1024, 2048, 4096] + fig.add_trace( + go.Scatter( + showlegend=False, + opacity=0.3, + mode="markers", + marker=dict( + size=np.sqrt(sizes), + color="rgba(0,0,0,0)", + line=dict(color="black", width=2), + ), + x=xpos, + y=ypos, + ) + ) + fig.add_annotation( + text="Embedding Size:", + font=dict(size=16), + x=np.log10(1.5e9), + y=10, + showarrow=False, + opacity=0.3, + ) + for x, y, size in zip(xpos, np.linspace(7.5, 14, 4), sizes): + fig.add_annotation( + text=f"{size}", + font=dict(size=12), + x=np.log10(x), + y=y, + showarrow=True, + ay=0, + ax=50, + opacity=0.3, + arrowwidth=2, + ) + return fig + + +@failsafe_plot def performance_size_plot(df: pd.DataFrame) -> go.Figure: df = df.copy() df["Number of Parameters"] = df["Number of Parameters"].map(parse_n_params) @@ -50,6 +114,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: if not len(df.index): return go.Figure() min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max() + df["sqrt(dim)"] = np.sqrt(df["Embedding Dimensions"]) fig = px.scatter( df, x="Number of Parameters", @@ -57,7 +122,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: log_x=True, template="plotly_white", text="model_text", - size="Embedding Dimensions", + size="sqrt(dim)", color="Log(Tokens)", range_color=[2, 5], range_x=[8 * 1e6, 11 * 1e9], @@ -69,10 +134,21 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: "Mean (Task)": True, "Rank (Borda)": True, "Log(Tokens)": False, + "sqrt(dim)": 
False, "model_text": False, }, hover_name="Model", ) + # Note: it's important that this comes before setting the size mode + fig = add_size_guide(fig) + fig.update_traces( + marker=dict( + sizemode="diameter", + sizeref=1.5, + sizemin=0, + ) + ) + fig.add_annotation(x=1e9, y=10, text="Model size:") fig.update_layout( coloraxis_colorbar=dict( # noqa title="Max Tokens", @@ -124,14 +200,15 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: "#3CBBB1", ] fill_colors = [ - "rgba(238,66,102,0.2)", - "rgba(0,166,237,0.2)", - "rgba(236,167,44,0.2)", - "rgba(180,35,24,0.2)", - "rgba(60,187,177,0.2)", + "rgba(238,66,102,0.05)", + "rgba(0,166,237,0.05)", + "rgba(236,167,44,0.05)", + "rgba(180,35,24,0.05)", + "rgba(60,187,177,0.05)", ] +@failsafe_plot def radar_chart(df: pd.DataFrame) -> go.Figure: df = df.copy() df["Model"] = df["Model"].map(parse_model_name) @@ -139,6 +216,10 @@ def radar_chart(df: pd.DataFrame) -> go.Figure: task_type_columns = [ column for column in df.columns if "".join(column.split()) in task_types ] + if len(task_type_columns) <= 1: + raise ValueError( + "Couldn't produce radar chart, the benchmark only contains one task category." 
+ ) df = df[["Model", *task_type_columns]].set_index("Model") df = df.replace("", np.nan) df = df.dropna() @@ -156,7 +237,7 @@ def radar_chart(df: pd.DataFrame) -> go.Figure: mode="lines", line=dict(width=2, color=line_colors[i]), fill="toself", - fillcolor=fill_colors[i], + fillcolor="rgba(0,0,0,0)", ) ) fig.update_layout( From 6ecc86ff2f6fc0ea83332cb9a454df8c7e178ddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 16 Dec 2024 20:48:36 +0100 Subject: [PATCH 14/31] fix: Leaderboard refinements (#1603) * Added explanation of aggregate measures * Added download button to result tables * Task info gets sorted by task name * Added custom, shareable links for each benchmark * Moved explanation of aggregate metrics to the summary tab --- mteb/leaderboard/app.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 6838891e02..7cc658d0cb 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -1,10 +1,13 @@ from __future__ import annotations import json +import tempfile from collections import defaultdict from pathlib import Path +from urllib.parse import urlencode import gradio as gr +import pandas as pd from gradio_rangeslider import RangeSlider import mteb @@ -24,6 +27,30 @@ def load_results(): return mteb.BenchmarkResults.from_validated(**json.load(cache_file)) +def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: + """Produces a URL for the selected benchmark.""" + params = urlencode( + { + "benchmark_name": benchmark_name, + } + ) + base_url = request.request.base_url + url = f"{base_url}?{params}" + md = f"```\n{url}\n```" + return md + + +def set_benchmark_on_load(request: gr.Request): + query_params = request.query_params + return query_params.get("benchmark_name", "MTEB(Multilingual, beta)") + + +def download_table(table: pd.DataFrame) -> Path: + file = tempfile.NamedTemporaryFile(delete=False, 
suffix=".csv") + table.to_csv(file) + return file.name + + def update_citation(benchmark_name: str) -> str: benchmark = mteb.get_benchmark(benchmark_name) if str(benchmark.citation) != "None": @@ -66,6 +93,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: properties=["name", "type", "languages", "domains", "reference", "main_score"] ) df["languages"] = df["languages"].map(format_list) + df = df.sort_values("name") df["domains"] = df["domains"].map(format_list) df["name"] = "[" + df["name"] + "](" + df["reference"] + ")" df = df.rename( @@ -217,6 +245,8 @@ def update_task_info(task_names: str) -> gr.DataFrame: inputs=[benchmark_select, lang_select, type_select, domain_select], ) citation = gr.Markdown(update_citation, inputs=[benchmark_select]) + with gr.Accordion("Share this benchmark:", open=False): + gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) with gr.Column(): with gr.Tab("Performance per Model Size"): plot = gr.Plot(performance_size_plot, inputs=[summary_table]) @@ -229,12 +259,36 @@ def update_task_info(task_names: str) -> gr.DataFrame: "*We only display models that have been run on all task types in the benchmark*" ) with gr.Tab("Summary"): + with gr.Accordion( + "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", + open=False, + ): + gr.Markdown( + """ + **Rank(borda)** is computed based on the [borda count](https://en.wikipedia.org/wiki/Borda_count), where each task is treated as a preference voter, which gives votes on the models in accordance with their relative performance on the task. The best model obtains the highest number of votes. The model with the highest number of votes across tasks obtains the highest rank. The Borda rank tends to prefer models that perform well broadly across tasks. However, given that it is a rank it can be unclear if the two models perform similarly. + + **Mean(Task)**: This is a naïve average computed across all the tasks within the benchmark. 
This score is simple to understand and is continuous as opposed to the Borda rank. However, the mean can overvalue tasks with higher variance in its scores. + + **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. + """ + ) summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] + ) with gr.Tab("Performance per task"): per_task_table.render() + download_per_task = gr.DownloadButton("Download Table") + download_per_task.click( + download_table, inputs=[per_task_table], outputs=[download_per_task] + ) with gr.Tab("Task information"): task_info_table = gr.DataFrame(update_task_info, inputs=[task_select]) + # This sets the benchmark from the URL query parameters + demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) + @gr.on(inputs=[scores, searchbar], outputs=[summary_table, per_task_table]) def update_tables(scores, search_query: str): summary, per_task = scores_to_tables(scores, search_query) From 5e9c468513b6a3f139d55980f0b2ce40c0c672f6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 16 Dec 2024 20:04:35 +0000 Subject: [PATCH 15/31] 1.25.1 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ca8e8db850..dece91213a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.25.0" +version = "1.25.1" description = "Massive Text Embedding Benchmark" readme = "README.md" 
authors = [ From b81b584ceb1bd8a42a676482edcc19c90de75cb1 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Tue, 17 Dec 2024 04:02:51 +0500 Subject: [PATCH 16/31] Feat: Use similarity scores if available (#1602) * Use similarity scores if available * lint --- .../evaluators/BitextMiningEvaluator.py | 29 ++++++++++++++----- .../evaluators/RerankingEvaluator.py | 27 +++++++++++------ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/mteb/evaluation/evaluators/BitextMiningEvaluator.py b/mteb/evaluation/evaluators/BitextMiningEvaluator.py index 4fa7022ed6..4b068653da 100644 --- a/mteb/evaluation/evaluators/BitextMiningEvaluator.py +++ b/mteb/evaluation/evaluators/BitextMiningEvaluator.py @@ -62,7 +62,7 @@ def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any] = {}): tqdm.tqdm(self.pairs, desc="Matching sentences") ): scores[f"{key1}-{key2}"] = self._compute_metrics( - embeddings[key1], embeddings[key2] + embeddings[key1], embeddings[key2], model ) # in case of default pair unnest the dict @@ -76,10 +76,13 @@ def _compute_metrics( self, embeddings1, embeddings2, + model: Encoder, ): # Find nearest neighbors logger.info("Finding nearest neighbors...") - nearest_neighbors = self._similarity_search(embeddings1, embeddings2, top_k=1) + nearest_neighbors = self._similarity_search( + embeddings1, embeddings2, model, top_k=1 + ) # Compute errors logger.info("Computing metrics...") @@ -106,10 +109,10 @@ def _similarity_search( self, query_embeddings, corpus_embeddings, + model: Encoder, query_chunk_size: int = 100, corpus_chunk_size: int = 500000, top_k: int = 10, - score_function=cos_sim, ): """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings. It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries. @@ -117,10 +120,10 @@ def _similarity_search( Args: query_embeddings: A 2 dimensional tensor with the query embeddings. 
corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. + model: The model used to encode the queries and corpus. This is used to check if the embeddings are on the same device and to encode the queries and corpus if they are not already tensors. query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. top_k: Retrieve top k matching entries. - score_function: Function for computing scores. By default, cosine similarity. Returns: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores. @@ -142,7 +145,7 @@ def _similarity_search( # Iterate over chunks of the corpus for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): # Compute cosine similarities - cos_scores = score_function( + similarity_scores = cos_sim( query_embeddings[ query_start_idx : query_start_idx + query_chunk_size ], @@ -151,10 +154,20 @@ def _similarity_search( ], ) + if hasattr(model, "similarity"): + similarity_scores = model.similarity( + query_embeddings[ + query_start_idx : query_start_idx + query_chunk_size + ], + corpus_embeddings[ + corpus_start_idx : corpus_start_idx + corpus_chunk_size + ], + ) + # Get top-k scores cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - cos_scores, - min(top_k, len(cos_scores[0])), + similarity_scores, + min(top_k, len(similarity_scores[0])), dim=1, largest=True, sorted=False, @@ -162,7 +175,7 @@ def _similarity_search( cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - for query_itr in range(len(cos_scores)): + for query_itr in range(len(similarity_scores)): for sub_corpus_id, score in zip( cos_scores_top_k_idx[query_itr], 
cos_scores_top_k_values[query_itr], diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 62d741ee0c..3df204f86c 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -34,7 +34,6 @@ def __init__( task_name: str | None = None, mrr_at_k: int = 10, name: str = "", - similarity_fct=cos_sim, encode_kwargs: dict[str, Any] = {}, use_batched_encoding: bool = True, limit: int | None = None, @@ -48,7 +47,6 @@ def __init__( self.samples = samples self.name = name self.mrr_at_k = mrr_at_k - self.similarity_fct = similarity_fct self.use_batched_encoding = use_batched_encoding self.task_name = task_name self.k_values = k_values @@ -211,6 +209,7 @@ def _encode_candidates_batched( all_mrr_scores, all_ap_scores, all_conf_scores, + model, ) def _encode_candidates_individual( @@ -257,6 +256,7 @@ def _encode_candidates_individual( all_mrr_scores, all_ap_scores, all_conf_scores, + model, ) def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): @@ -316,7 +316,7 @@ def _encode_candidates_miracl_batched(self, all_query_embs, model: Encoder): docs_idx += num_doc fake_qid = str(query_idx) - results[fake_qid] = self.rerank(query_emb, docs_emb) + results[fake_qid] = self.rerank(query_emb, docs_emb, model) qrels[fake_qid] = { str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) } @@ -351,7 +351,7 @@ def _encode_candidates_miracl_individual(self, model: Encoder): ) fake_qid = str(i) - results[fake_qid] = self.rerank(query_emb, docs_emb) + results[fake_qid] = self.rerank(query_emb, docs_emb, model) qrels[fake_qid] = { str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) } @@ -371,7 +371,7 @@ def _collect_miracl_results(self, results, qrels): return scores_miracl def rerank( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor + self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder ) -> dict[str, float]: 
"""Rerank documents (docs_emb) given the query (query_emb) @@ -379,6 +379,7 @@ def rerank( query_emb: Query embedding of shape `(num_queries, hidden_size)`) if `num_queries` > 0: we take the closest document to any of the queries docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) + model: Model to use for computing similarity scores if model.similarity is available Returns: similarity_scores: @@ -389,7 +390,10 @@ def rerank( if not docs_emb.shape[0]: return {"empty-docid": 0} - pred_scores = self.similarity_fct(query_emb, docs_emb) + if hasattr(model, "similarity"): + pred_scores = model.similarity(query_emb, docs_emb) + else: + pred_scores = cos_sim(query_emb, docs_emb) if len(pred_scores.shape) > 1: pred_scores = torch.amax(pred_scores, dim=0) @@ -405,8 +409,9 @@ def _apply_sim_scores( all_mrr_scores, all_ap_scores, all_conf_scores, + model: Encoder, ): - sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) + sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb, model) scores = self._compute_metrics_instance(sim_scores, is_relevant) conf_scores = self.conf_scores(sim_scores.tolist()) @@ -443,7 +448,7 @@ def _encode_unique_texts( return all_unique_texts_embs[all_texts_indexes] def _compute_sim_scores_instance( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor + self, query_emb: np.ndarray, docs_emb: np.ndarray, model: Encoder ) -> torch.Tensor: """Computes similarity scores for a single instance = (query, positives, negatives) @@ -451,11 +456,15 @@ def _compute_sim_scores_instance( query_emb: Query embedding, with shape `(num_queries, hidden_size)` if `num_queries` > 0: we take the closest document to any of the queries docs_emb: Candidates documents embeddings, with shape `(num_pos+num_neg, hidden_size)` + model: Model to use for computing similarity scores if model.similarity is available Returns: sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)` """ - sim_scores = 
self.similarity_fct(query_emb, docs_emb) + if hasattr(model, "similarity"): + sim_scores = model.similarity(query_emb, docs_emb) + else: + sim_scores = cos_sim(query_emb, docs_emb) if len(sim_scores.shape) > 1: sim_scores = torch.amax(sim_scores, dim=0) From 6731b94d0c7b1bc3a01329d12b3c5b03520ae102 Mon Sep 17 00:00:00 2001 From: KGupta10 <92774828+KGupta10@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:26:53 -0800 Subject: [PATCH 17/31] Add NanoBEIR Datasets (#1588) * add NanoClimateFeverRetrieval task, still requires some debugging * move task to correct place in init file * add all Nano datasets and results * format code * Update mteb/tasks/Retrieval/eng/tempCodeRunnerFile.py Co-authored-by: Roman Solomatin * pin revision to commit and add datasets to benchmark.py * create new benchmark for NanoBEIR * add revision when loading datasets * lint --------- Co-authored-by: Roman Solomatin Co-authored-by: isaac-chung --- mteb/benchmarks/benchmarks.py | 24 +++++ mteb/tasks/Retrieval/__init__.py | 13 +++ .../Retrieval/eng/NanoArguAnaRetrieval.py | 85 +++++++++++++++ .../eng/NanoClimateFeverRetrieval.py | 85 +++++++++++++++ .../Retrieval/eng/NanoDBPediaRetrieval.py | 75 +++++++++++++ .../tasks/Retrieval/eng/NanoFEVERRetrieval.py | 99 +++++++++++++++++ .../Retrieval/eng/NanoFiQA2018Retrieval.py | 85 +++++++++++++++ .../Retrieval/eng/NanoHotpotQARetrieval.py | 102 ++++++++++++++++++ .../Retrieval/eng/NanoMSMARCORetrieval.py | 97 +++++++++++++++++ .../Retrieval/eng/NanoNFCorpusRetrieval.py | 87 +++++++++++++++ mteb/tasks/Retrieval/eng/NanoNQRetrieval.py | 83 ++++++++++++++ .../tasks/Retrieval/eng/NanoQuoraRetrieval.py | 86 +++++++++++++++ .../Retrieval/eng/NanoSCIDOCSRetrieval.py | 85 +++++++++++++++ .../Retrieval/eng/NanoSciFactRetrieval.py | 83 ++++++++++++++ .../Retrieval/eng/NanoTouche2020Retrieval.py | 94 ++++++++++++++++ 15 files changed, 1183 insertions(+) create mode 100644 mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py create mode 100644 
mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoNQRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index d5efbc092f..e872143ee5 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -978,3 +978,27 @@ def load_results( year={2024} }""", ) + +NANOBEIR = Benchmark( + name="NanoBEIR", + tasks=get_tasks( + tasks=[ + "NanoArguAnaRetrieval", + "NanoClimateFeverRetrieval", + "NanoDBPediaRetrieval", + "NanoFEVERRetrieval", + "NanoFiQA2018Retrieval", + "NanoHotpotQARetrieval", + "NanoMSMARCORetrieval", + "NanoNFCorpusRetrieval", + "NanoNQRetrieval", + "NanoQuoraRetrieval", + "NanoSCIDOCSRetrieval", + "NanoSciFactRetrieval", + "NanoTouche2020Retrieval", + ], + ), + description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power", + reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", + citation=None, +) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index ca41d4354f..d83df7ec5e 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -64,6 +64,19 @@ from .eng.MLQuestions import * from .eng.MSMARCORetrieval import * from .eng.MSMARCOv2Retrieval 
import * +from .eng.NanoArguAnaRetrieval import * +from .eng.NanoClimateFeverRetrieval import * +from .eng.NanoDBPediaRetrieval import * +from .eng.NanoFEVERRetrieval import * +from .eng.NanoFiQA2018Retrieval import * +from .eng.NanoHotpotQARetrieval import * +from .eng.NanoMSMARCORetrieval import * +from .eng.NanoNFCorpusRetrieval import * +from .eng.NanoNQRetrieval import * +from .eng.NanoQuoraRetrieval import * +from .eng.NanoSCIDOCSRetrieval import * +from .eng.NanoSciFactRetrieval import * +from .eng.NanoTouche2020Retrieval import * from .eng.NarrativeQARetrieval import * from .eng.NFCorpusRetrieval import * from .eng.NQRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py b/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py new file mode 100644 index 0000000000..2230368b94 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoArguAnaRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoArguAnaRetrieval", + description="NanoArguAna is a smaller subset of ArguAna, a dataset for argument retrieval in debate contexts.", + reference="http://argumentation.bplaced.net/arguana/data", + dataset={ + "path": "zeta-alpha-ai/NanoArguAna", + "revision": "8f4a982d470a32c45817738b9d29042ca55d75ad", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2020-01-01", "2020-12-31"], + domains=["Medical", "Written"], + task_subtypes=["Discourse coherence"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{boteva2016, + author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, + title = {A 
Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + journal = {Proceedings of the 38th European Conference on Information Retrieval}, + journal-abbrev = {ECIR}, + year = {2016}, + city = {Padova}, + country = {Italy}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} +}""", + prompt={"query": "Given a claim, find documents that refute the claim"}, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoArguAna", + "corpus", + revision="8f4a982d470a32c45817738b9d29042ca55d75ad", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoArguAna", + "queries", + revision="8f4a982d470a32c45817738b9d29042ca55d75ad", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoArguAna", + "qrels", + revision="8f4a982d470a32c45817738b9d29042ca55d75ad", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py b/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py new file mode 100644 index 0000000000..0185a454d3 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoClimateFeverRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoClimateFeverRetrieval", + description="NanoClimateFever is a small version of 
the BEIR dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change.", + reference="https://arxiv.org/abs/2012.00614", + dataset={ + "path": "zeta-alpha-ai/NanoClimateFEVER", + "revision": "96741bfa30b9f56db8c9eb7d08e775ed6474f206", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2020-01-01", "2020-12-31"], + domains=["Non-fiction", "Academic", "News"], + task_subtypes=["Claim verification"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@misc{diggelmann2021climatefever, + title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + year={2021}, + eprint={2012.00614}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", + prompt={ + "query": "Given a claim about climate change, retrieve documents that support or refute the claim" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoClimateFEVER", + "corpus", + revision="96741bfa30b9f56db8c9eb7d08e775ed6474f206", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoClimateFEVER", + "queries", + revision="96741bfa30b9f56db8c9eb7d08e775ed6474f206", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoClimateFEVER", + "qrels", + revision="96741bfa30b9f56db8c9eb7d08e775ed6474f206", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for 
sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py b/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py new file mode 100644 index 0000000000..caa638743c --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoDBPediaRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoDBPediaRetrieval", + description="NanoDBPediaRetrieval is a small version of the standard test collection for entity search over the DBpedia knowledge base.", + reference="https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia", + dataset={ + "path": "zeta-alpha-ai/NanoDBPedia", + "revision": "438f1c25129f05db6238699b5afdc9c6b58d2096", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2015-01-01", "2015-12-31"], + domains=["Encyclopaedic"], + task_subtypes=["Topic classification"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{lehmann2015dbpedia, title={DBpedia: A large-scale, multilingual knowledge base extracted from Wikipedia}, author={Lehmann, Jens and et al.}, journal={Semantic Web}, year={2015}}""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoDBPedia", + "corpus", + revision="438f1c25129f05db6238699b5afdc9c6b58d2096", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoDBPedia", + "queries", + revision="438f1c25129f05db6238699b5afdc9c6b58d2096", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoDBPedia", + "qrels", + 
revision="438f1c25129f05db6238699b5afdc9c6b58d2096", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py new file mode 100644 index 0000000000..6bdd0ab4cf --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoFEVERRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoFEVERRetrieval", + description="NanoFEVER is a smaller version of " + + "FEVER (Fact Extraction and VERification), which consists of 185,445 claims generated by altering sentences" + + " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were" + + " derived from.", + reference="https://fever.ai/", + dataset={ + "path": "zeta-alpha-ai/NanoFEVER", + "revision": "a8bfdf1bf15181167a7e22e69cf8754bdea9b4c8", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2018-01-01", "2018-12-31"], + domains=["Academic", "Encyclopaedic"], + task_subtypes=["Claim verification"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{thorne-etal-2018-fever, + title = "{FEVER}: a Large-scale Dataset for Fact 
Extraction and {VER}ification", + author = "Thorne, James and + Vlachos, Andreas and + Christodoulopoulos, Christos and + Mittal, Arpit", + editor = "Walker, Marilyn and + Ji, Heng and + Stent, Amanda", + booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", + month = jun, + year = "2018", + address = "New Orleans, Louisiana", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N18-1074", + doi = "10.18653/v1/N18-1074", + pages = "809--819", + abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. 
Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.", +}""", + prompt={ + "query": "Given a claim, retrieve documents that support or refute the claim" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoFEVER", + "corpus", + revision="a8bfdf1bf15181167a7e22e69cf8754bdea9b4c8", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoFEVER", + "queries", + revision="a8bfdf1bf15181167a7e22e69cf8754bdea9b4c8", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoFEVER", + "qrels", + revision="a8bfdf1bf15181167a7e22e69cf8754bdea9b4c8", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py b/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py new file mode 100644 index 0000000000..1a3467c1d7 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoFiQA2018Retrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoFiQA2018Retrieval", + description="NanoFiQA2018 is a smaller subset of the Financial Opinion Mining and Question Answering dataset.", + reference="https://sites.google.com/view/fiqa/", + dataset={ + "path": "zeta-alpha-ai/NanoFiQA2018", + "revision": 
"4163ba032953d5044a7a6244261413f609c14342", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2018-01-01", "2018-12-31"], + domains=["Academic", "Social"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{ +thakur2021beir, +title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, +author={Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, +booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, +year={2021}, +url={https://openreview.net/forum?id=wCu6T5xFjeJ} +}""", + prompt={ + "query": "Given a financial question, retrieve user replies that best answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoFiQA2018", + "corpus", + revision="4163ba032953d5044a7a6244261413f609c14342", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoFiQA2018", + "queries", + revision="4163ba032953d5044a7a6244261413f609c14342", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoFiQA2018", + "qrels", + revision="4163ba032953d5044a7a6244261413f609c14342", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git 
a/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py new file mode 100644 index 0000000000..4389aeafa8 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoHotpotQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoHotpotQARetrieval", + description="NanoHotpotQARetrieval is a smaller subset of the " + + "HotpotQA dataset, which is a question answering dataset featuring natural, multi-hop questions, with strong" + + " supervision for supporting facts to enable more explainable question answering systems.", + reference="https://hotpotqa.github.io/", + dataset={ + "path": "zeta-alpha-ai/NanoHotpotQA", + "revision": "d79c0cdda980aba54842756770928035e1b61a51", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2018-01-01", "2018-12-31"], + domains=["Web", "Written"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, + title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", + author = "Yang, Zhilin and + Qi, Peng and + Zhang, Saizheng and + Bengio, Yoshua and + Cohen, William and + Salakhutdinov, Ruslan and + Manning, Christopher D.", + editor = "Riloff, Ellen and + Chiang, David and + Hockenmaier, Julia and + Tsujii, Jun{'}ichi", + booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", + month = oct # "-" # nov, + year = "2018", + address = "Brussels, Belgium", + publisher = "Association for Computational Linguistics", + url = 
"https://aclanthology.org/D18-1259", + doi = "10.18653/v1/D18-1259", + pages = "2369--2380", + abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", +}""", + prompt={ + "query": "Given a multi-hop question, retrieve documents that can help answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoHotpotQA", + "corpus", + revision="d79c0cdda980aba54842756770928035e1b61a51", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoHotpotQA", + "queries", + revision="d79c0cdda980aba54842756770928035e1b61a51", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoHotpotQA", + "qrels", + revision="d79c0cdda980aba54842756770928035e1b61a51", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for 
sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py b/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py new file mode 100644 index 0000000000..8a2f51e7fd --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoMSMARCORetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoMSMARCORetrieval", + description="NanoMSMARCORetrieval is a smaller subset of MS MARCO, a collection of datasets focused on deep learning in search.", + reference="https://microsoft.github.io/msmarco/", + dataset={ + "path": "zeta-alpha-ai/NanoMSMARCO", + "revision": "7b8ff22f2771dc65ac5b439f222eb19a1f56abda", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2016-01-01", "2016-12-31"], + domains=["Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, + author = {Tri Nguyen and + Mir Rosenberg and + Xia Song and + Jianfeng Gao and + Saurabh Tiwary and + Rangan Majumder and + Li Deng}, + title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, + journal = {CoRR}, + volume = {abs/1611.09268}, + year = {2016}, + url = {http://arxiv.org/abs/1611.09268}, + archivePrefix = {arXiv}, + eprint = {1611.09268}, + timestamp = {Mon, 13 Aug 2018 16:49:03 +0200}, + biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +}""", + prompt={ + "query": "Given a web search query, retrieve 
relevant passages that answer the query" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoMSMARCO", + "corpus", + revision="7b8ff22f2771dc65ac5b439f222eb19a1f56abda", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoMSMARCO", + "queries", + revision="7b8ff22f2771dc65ac5b439f222eb19a1f56abda", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoMSMARCO", + "qrels", + revision="7b8ff22f2771dc65ac5b439f222eb19a1f56abda", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py b/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py new file mode 100644 index 0000000000..0f6ac8533a --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoNFCorpusRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoNFCorpusRetrieval", + description="NanoNFCorpus is a smaller subset of NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval.", + reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/", + dataset={ + "path": "zeta-alpha-ai/NanoNFCorpus", + "revision": "dd542a7efb9ad2136b9e00768b60fca9038f8156", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], 
+ main_score="ndcg_at_10", + date=["2016-01-01", "2016-12-31"], + domains=["Medical", "Academic", "Written"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{boteva2016, + author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, + title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + journal = {Proceedings of the 38th European Conference on Information Retrieval}, + journal-abbrev = {ECIR}, + year = {2016}, + city = {Padova}, + country = {Italy}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} +}""", + prompt={ + "query": "Given a question, retrieve relevant documents that best answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoNFCorpus", + "corpus", + revision="dd542a7efb9ad2136b9e00768b60fca9038f8156", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoNFCorpus", + "queries", + revision="dd542a7efb9ad2136b9e00768b60fca9038f8156", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoNFCorpus", + "qrels", + revision="dd542a7efb9ad2136b9e00768b60fca9038f8156", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py b/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py new file mode 100644 index 0000000000..5aa831f799 --- /dev/null +++ 
b/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoNQRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoNQRetrieval", + description="NanoNQ is a smaller subset of a dataset which contains questions from real users, and it requires QA systems to read and comprehend an entire Wikipedia article that may or may not contain the answer to the question.", + reference="https://ai.google.com/research/NaturalQuestions", + dataset={ + "path": "zeta-alpha-ai/NanoNQ", + "revision": "77540146379abf95df8326a3c5bb9eb21c7146c3", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2019-01-01", "2019-12-31"], + domains=["Academic", "Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh + and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee + and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le + and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational + Linguistics}}""", + prompt={ + "query": "Given a question, retrieve Wikipedia passages that answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoNQ", + "corpus", + revision="77540146379abf95df8326a3c5bb9eb21c7146c3", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoNQ", + "queries", + revision="77540146379abf95df8326a3c5bb9eb21c7146c3", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoNQ", + "qrels", + revision="77540146379abf95df8326a3c5bb9eb21c7146c3", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py b/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py new file mode 100644 index 0000000000..1391d12b93 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoQuoraRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoQuoraRetrieval", + description="NanoQuoraRetrieval is a smaller subset of the " + + "QuoraRetrieval dataset, which is based on questions that are marked as duplicates on the Quora platform. 
Given a" + + " question, find other (duplicate) questions.", + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + dataset={ + "path": "zeta-alpha-ai/NanoQuoraRetrieval", + "revision": "2ab2d73e6c862026282808b913a34f4136928545", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2017-01-01", "2017-12-31"], + domains=["Social"], + task_subtypes=["Duplicate Detection"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@misc{quora-question-pairs, + author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, + title = {Quora Question Pairs}, + publisher = {Kaggle}, + year = {2017}, + url = {https://kaggle.com/competitions/quora-question-pairs} +}""", + prompt={ + "query": "Given a question, retrieve questions that are semantically equivalent to the given question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoQuoraRetrieval", + "corpus", + revision="2ab2d73e6c862026282808b913a34f4136928545", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoQuoraRetrieval", + "queries", + revision="2ab2d73e6c862026282808b913a34f4136928545", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoQuoraRetrieval", + "qrels", + revision="2ab2d73e6c862026282808b913a34f4136928545", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = 
True diff --git a/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py b/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py new file mode 100644 index 0000000000..2d27e1a2dc --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoSCIDOCSRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoSCIDOCSRetrieval", + description="NanoSCIDOCS is a smaller subset of " + + "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation" + + " prediction, to document classification and recommendation.", + reference="https://allenai.org/data/scidocs", + dataset={ + "path": "zeta-alpha-ai/NanoSCIDOCS", + "revision": "484eb90549fc3f0b9c42b3551e80ceb999515537", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2020-01-01", "2020-12-31"], + domains=["Academic", "Written", "Non-fiction"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{specter2020cohan, + title={SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S.
Weld}, + booktitle={ACL}, + year={2020} +}""", + prompt={ + "query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoSCIDOCS", + "corpus", + revision="484eb90549fc3f0b9c42b3551e80ceb999515537", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoSCIDOCS", + "queries", + revision="484eb90549fc3f0b9c42b3551e80ceb999515537", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoSCIDOCS", + "qrels", + revision="484eb90549fc3f0b9c42b3551e80ceb999515537", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py b/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py new file mode 100644 index 0000000000..aff949d319 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoSciFactRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoSciFactRetrieval", + description="NanoSciFact is a smaller subset of SciFact, which verifies scientific claims using evidence from the research literature containing scientific paper abstracts.", + reference="https://github.com/allenai/scifact", + dataset={ + "path": "zeta-alpha-ai/NanoSciFact", + "revision": 
"309f1d1ae3ae2e092444a8a0c25bed59b82318bc", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=["2018-01-01", "2018-12-31"], + domains=["Academic", "Medical", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@inproceedings{specter2020cohan, + title={SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, + booktitle={ACL}, + year={2020} +}""", + prompt={ + "query": "Given a scientific claim, retrieve documents that support or refute the claim" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoSciFact", + "corpus", + revision="309f1d1ae3ae2e092444a8a0c25bed59b82318bc", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoSciFact", + "queries", + revision="309f1d1ae3ae2e092444a8a0c25bed59b82318bc", + ) + self.relevant_docs = load_dataset( + "zeta-alpha-ai/NanoSciFact", + "qrels", + revision="309f1d1ae3ae2e092444a8a0c25bed59b82318bc", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py b/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py new file mode 100644 index 0000000000..656b5494a0 --- /dev/null +++ 
b/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from datasets import load_dataset + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NanoTouche2020Retrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NanoTouche2020Retrieval", + description="NanoTouche2020 is a smaller subset of Touché Task 1: Argument Retrieval for Controversial Questions.", + reference="https://webis.de/events/touche-20/shared-task-1.html", + dataset={ + "path": "zeta-alpha-ai/NanoTouche2020", + "revision": "0d2f26ed8c5ad309f95c7f9499c70a40e140fccd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2020-09-23", "2020-09-23"), + domains=["Academic"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@dataset{potthast_2022_6862281, + author = {Potthast, Martin and + Gienapp, Lukas and + Wachsmuth, Henning and + Hagen, Matthias and + Fröbe, Maik and + Bondarenko, Alexander and + Ajjour, Yamen and + Stein, Benno}, + title = {{Touché20-Argument-Retrieval-for-Controversial- + Questions}}, + month = jul, + year = 2022, + publisher = {Zenodo}, + doi = {10.5281/zenodo.6862281}, + url = {https://doi.org/10.5281/zenodo.6862281} +}""", + prompt={ + "query": "Given a question, retrieve detailed and persuasive arguments that answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus = load_dataset( + "zeta-alpha-ai/NanoTouche2020", + "corpus", + revision="0d2f26ed8c5ad309f95c7f9499c70a40e140fccd", + ) + self.queries = load_dataset( + "zeta-alpha-ai/NanoTouche2020", + "queries", + revision="0d2f26ed8c5ad309f95c7f9499c70a40e140fccd", + ) + self.relevant_docs = load_dataset( + 
"zeta-alpha-ai/NanoTouche2020", + "qrels", + revision="0d2f26ed8c5ad309f95c7f9499c70a40e140fccd", + ) + + self.corpus = { + split: { + sample["_id"]: {"_id": sample["_id"], "text": sample["text"]} + for sample in self.corpus[split] + } + for split in self.corpus + } + + self.queries = { + split: {sample["_id"]: sample["text"] for sample in self.queries[split]} + for split in self.queries + } + + self.relevant_docs = { + split: { + sample["query-id"]: {sample["corpus-id"]: 1} + for sample in self.relevant_docs[split] + } + for split in self.relevant_docs + } + + self.data_loaded = True From 9de7f20e091a674e984e6b89b099a5f31cd09cd7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 18 Dec 2024 17:29:07 +0000 Subject: [PATCH 18/31] Update tasks table --- docs/tasks.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 70cff09bdd..f4ec3c792e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -361,6 +361,19 @@ The following tables give you an overview of the tasks in MTEB. 
| [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [NamaaMrTydiReranking](https://huggingface.co/NAMAA-Space) (Muennighoff et al., 2022) | ['ara'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | +| [NanoArguAnaRetrieval](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | +| [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Non-fiction, Academic, News] | None | None | +| [NanoDBPediaRetrieval](https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia) (Lehmann et al., 2015) | ['eng'] | Retrieval | s2p | [Encyclopaedic] | None | None | +| [NanoFEVERRetrieval](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic] | None | None | +| [NanoFiQA2018Retrieval](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Academic, Social] | None | None | +| [NanoHotpotQARetrieval](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [NanoMSMARCORetrieval](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Web] | None | None | +| [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [NanoNQRetrieval](https://ai.google.com/research/NaturalQuestions) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Academic, Web] | None | None | +| 
[NanoQuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Social] | None | None | +| [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [NanoSciFactRetrieval](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) | ['eng'] | Retrieval | s2p | [Academic] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | | [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | None | None | | [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | @@ -890,7 +903,7 @@ The following tables give you an overview of the tasks in MTEB. | ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | | emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 92 | 13 | 2 | 1 | 303 | +| eng | English | Indo-European | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 105 | 13 | 2 | 1 | 316 | | enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1662,7 +1675,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 460 | 85 | 2 | 2 | +| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 473 | 85 | 2 | 2 |
From 48cb97d1e75cdf8260283d6628ac515141ed1e92 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 18 Dec 2024 23:28:23 +0500 Subject: [PATCH 19/31] Feat: Evaluate missing languages (#1584) * init * fix tests * update mock retrieval * update tests * use subsets instead of langs * Apply suggestions from code review Co-authored-by: Isaac Chung * fix tests * add to readme * rename subset in readme --------- Co-authored-by: Isaac Chung --- README.md | 15 ++ mteb/abstasks/AbsTask.py | 6 +- mteb/abstasks/AbsTaskBitextMining.py | 7 +- mteb/abstasks/AbsTaskClassification.py | 3 + mteb/abstasks/AbsTaskInstructionRetrieval.py | 7 +- .../AbsTaskMultilabelClassification.py | 4 + mteb/abstasks/AbsTaskRetrieval.py | 3 + mteb/evaluation/MTEB.py | 135 ++++++++--- tests/test_benchmark/mock_tasks.py | 76 +++++- .../test_evaluation/test_split_evaluation.py | 220 +++++++++++++++++- 10 files changed, 441 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index ca57669da4..f556cad894 100644 --- a/README.md +++ b/README.md @@ -209,6 +209,21 @@ Note that the public leaderboard uses the test splits for all datasets except MS + +
+ Selecting evaluation subset + +### Selecting evaluation subset +You can evaluate only on selected subsets. For example, if you want to evaluate only the `subset_name_to_run` subset of all tasks, do the following: + +```python +evaluation.run(model, eval_subsets=["subset_name_to_run"]) +``` + +Monolingual tasks have only the `default` subset; other tasks have subsets that are specific to the dataset. + +
+
Using a custom model diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 4bc64c86bf..443725ec7f 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -89,17 +89,18 @@ def evaluate( self, model: Encoder, split: str = "test", + subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: """Evaluates a Sentence Embedding Model on the task. - Returns a dict (that can be serialized to json). Args: model: Sentence embedding method. Implements a encode(sentences) method, that encodes sentences and returns a numpy matrix with the sentence embeddings split: Which datasplit to be used. + subsets_to_run: List of HFSubsets to evaluate. If None, all subsets are evaluated. encode_kwargs: Additional keyword arguments that are passed to the model's `encode` method. kwargs: Additional keyword arguments that are passed to the _evaluate_subset method. """ @@ -111,6 +112,9 @@ def evaluate( scores = {} hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"] + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + for hf_subset in hf_subsets: logger.info( f"\nTask: {self.metadata_dict['name']}, split: {split}, subset: {hf_subset}. Running..." 
diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 59d64039fd..1c373cc2f7 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -67,7 +67,8 @@ def __init__(self, **kwargs): def evaluate( self, model: Encoder, - split: str, + split: str = "test", + subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs, @@ -77,6 +78,10 @@ def evaluate( hf_subsets = list(self.dataset) if self.is_multilingual else ["default"] + # If subsets_to_run is specified, filter the hf_subsets accordingly + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + scores = {} if self.parallel_subsets: scores = self._evaluate_subset( diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py index 62908c98a4..eac41856e8 100644 --- a/mteb/abstasks/AbsTaskClassification.py +++ b/mteb/abstasks/AbsTaskClassification.py @@ -95,6 +95,7 @@ def evaluate( model, eval_split: str = "test", train_split: str = "train", + subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs, @@ -104,6 +105,8 @@ def evaluate( scores = {} hf_subsets = list(self.dataset) if self.is_multilingual else ["default"] + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] for hf_subset in hf_subsets: logger.info( diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py index 219426fe63..ed24a7cc87 100644 --- a/mteb/abstasks/AbsTaskInstructionRetrieval.py +++ b/mteb/abstasks/AbsTaskInstructionRetrieval.py @@ -516,6 +516,7 @@ def evaluate( self, model: Encoder, split: str = "test", + subsets_to_run: list[str] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs, @@ -528,7 +529,11 @@ def evaluate( ) scores = {} if self.is_multilingual: - for lang in self.hf_subsets: + hf_subsets = self.hf_subsets + if 
subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + + for lang in hf_subsets: logger.info(f"Language: {lang}") scores[lang] = self._evaluate_subset_lang( retriever, diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index 38d3722ff2..034b607c26 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -121,6 +121,7 @@ def evaluate( model: Encoder, eval_split: str = "test", train_split: str = "train", + subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs: Any, @@ -130,6 +131,9 @@ def evaluate( scores = {} hf_subsets = list(self.dataset) if self.is_multilingual else ["default"] + # If subsets_to_run is specified, filter the hf_subsets accordingly + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] for hf_subset in hf_subsets: logger.info( diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 95746e1a2d..8a780658f7 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -304,6 +304,7 @@ def evaluate( self, model, split: str = "test", + subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, **kwargs, @@ -317,6 +318,8 @@ def evaluate( scores = {} hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] for hf_subset in hf_subsets: logger.info(f"Subset: {hf_subset}") diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 29f7ba5f61..cf454fc25a 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -293,8 +293,9 @@ def load_tasks_data(self): def _run_eval( task: AbsTask, model: Encoder, - split, - output_folder, + split: str, + output_folder: str | None, + subsets_to_run: list[str] | None = 
None, *, encode_kwargs: dict[str, Any], **kwargs: Any, @@ -303,6 +304,7 @@ def _run_eval( results = task.evaluate( model, split, + subsets_to_run=subsets_to_run, output_folder=output_folder, encode_kwargs=encode_kwargs, **kwargs, @@ -379,7 +381,8 @@ def run( model: SentenceTransformer | Encoder, verbosity: int = 1, output_folder: str | None = "results", - eval_splits=None, + eval_splits: list[str] | None = None, + eval_subsets: list[str] | None = None, overwrite_results: bool = False, raise_error: bool = True, co2_tracker: bool = False, @@ -398,6 +401,7 @@ def run( output_folder: Folder where the results will be saved. Default to 'results'. Where it will save the results in the format: `{output_folder}/{model_name}/{model_revision}/{task_name}.json`. eval_splits: List of splits to evaluate on. If None, the splits are taken from the task metadata. + eval_subsets: List of subsets to evaluate on. If None, the subsets are taken from the task metadata. overwrite_results: Whether to overwrite existing results. raise_error: Whether to raise an error if an exception occurs during evaluation. co2_tracker: Whether to enable or disable CO2 emissions tracker using codecarbon. 
@@ -455,9 +459,16 @@ def run( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) + task_eval_splits = ( + eval_splits if eval_splits is not None else task.eval_splits + ) + task_subsets = list(task.metadata.hf_subsets_to_langscripts.keys()) + + existing_results = None + save_path = None + if output_path: save_path = output_path / f"{task.metadata.name}{task.save_suffix}.json" - existing_results = None if save_path.exists(): existing_results = TaskResult.from_disk(save_path) @@ -469,38 +480,53 @@ def run( del self.tasks[0] # empty memory continue - task_eval_splits = ( - eval_splits if eval_splits is not None else task.eval_splits - ) - missing_splits = self._get_missing_splits( - existing_results, task_eval_splits - ) - - if not missing_splits and existing_results: - evaluation_results.append(existing_results) + # Unified call to get missing splits and subsets + missing_evaluations = self._get_missing_evaluations( + existing_results, + task_eval_splits, + task_subsets, + eval_subsets, + ) - # no splits are evaluated. - self.last_evaluated_splits[task.metadata.name] = [] - del self.tasks[0] - continue + # Determine final splits to run + final_splits_to_run = [] + # We need to run any split that is fully missing or has missing subsets + for sp, info in missing_evaluations.items(): + if info["whole_split_missing"] or info["missing_subsets"]: + final_splits_to_run.append(sp) - if missing_splits: + # If no splits need to be run and results exist, skip + if not final_splits_to_run: + if existing_results is not None: + evaluation_results.append(existing_results) + else: logger.info( - f"Running evaluation for missing splits: {missing_splits}" + f"No splits to evaluate for {task.metadata.name}. Skipping evaluation." 
) + self.last_evaluated_splits[task.metadata.name] = [] + del self.tasks[0] + continue try: task.check_if_dataset_is_superseded() task.load_data(eval_splits=task_eval_splits, **kwargs) - # run evaluation task_results = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None self.last_evaluated_splits[task.metadata.name] = [] - for split in missing_splits: + for split in final_splits_to_run: + info = missing_evaluations[split] + + # Determine subsets to run for this split + # If the whole split is missing, run all required subsets + # If only some subsets are missing, run only those + subsets_to_run = info["missing_subsets"] + if info["whole_split_missing"] and task_subsets is None: + subsets_to_run = ["default"] + if co2_tracker: try: from codecarbon import EmissionsTracker @@ -508,7 +534,6 @@ def run( raise ImportError( "To use the CO2 emissions tracker, please install codecarbon using 'pip install codecarbon'" ) - with EmissionsTracker( save_to_file=False, save_to_api=False, logging_logger=logger ) as tracker: @@ -516,8 +541,8 @@ def run( task, model, split, - output_folder, encode_kwargs=encode_kwargs, + subsets_to_run=subsets_to_run, **kwargs, ) @@ -530,12 +555,11 @@ def run( model, split, output_folder, + subsets_to_run=subsets_to_run, encode_kwargs=encode_kwargs, **kwargs, ) - self.last_evaluated_splits[task.metadata.name].append(split) - logger.info( f"Evaluation for {task.metadata_dict['name']} on {split} took {tock - tick:.2f} seconds" ) @@ -543,8 +567,11 @@ def run( task_results[split] = results if verbosity >= 1: - logger.info(f"Scores: {results}") + logger.info(f"Scores: {task_results[split]}") + + self.last_evaluated_splits[task.metadata.name].append(split) + # Create new TaskResult new_results = TaskResult.from_task_results( task, task_results, @@ -552,6 +579,9 @@ def run( kg_co2_emissions=kg_co2_emissions, ) + # Merge with existing if needed + if output_path and save_path.exists(): + existing_results = 
TaskResult.from_disk(save_path) if existing_results: merged_results = self._merge_results(existing_results, new_results) else: @@ -637,3 +667,56 @@ def get_last_evaluated_splits(self): return deepcopy( {task: list(splits) for task, splits in self.last_evaluated_splits.items()} ) + + @staticmethod + def _get_missing_evaluations( + existing_results: TaskResult | None, + task_eval_splits: list[str], + task_eval_langs: list[str], + eval_subsets: list[str] | None, + ) -> dict[str, dict[str, Any]]: + """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing.""" + missing_evaluations = { + split: {"whole_split_missing": False, "missing_subsets": []} + for split in task_eval_splits + } + + # Determine subsets to consider if multilingual + if eval_subsets is None: + # If no eval_langs specified, consider all subsets + subsets_to_consider = task_eval_langs + else: + subsets_to_consider = [ + subset for subset in task_eval_langs if subset in eval_subsets + ] + + # If no existing results, all splits and subsets are missing + if existing_results is None: + for split in task_eval_splits: + missing_evaluations[split]["whole_split_missing"] = True + missing_evaluations[split]["missing_subsets"] = list( + subsets_to_consider + ) + return missing_evaluations + + # If we have existing results, check which splits and subsets are missing + for split in task_eval_splits: + if split not in existing_results.scores: + # Whole split missing + missing_evaluations[split]["whole_split_missing"] = True + missing_evaluations[split]["missing_subsets"] = list( + subsets_to_consider + ) + else: + # Some subsets may be missing + existing_subsets = { + score_dict["hf_subset"] + for score_dict in existing_results.scores[split] + } + missing_subsets = [ + s for s in subsets_to_consider if s not in existing_subsets + ] + if missing_subsets: + missing_evaluations[split]["missing_subsets"] = missing_subsets + + return missing_evaluations diff --git 
a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 1442902288..d3a11b2a4b 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -1349,6 +1349,62 @@ def load_data(self, **kwargs): class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): expected_stats = { + "val": { + "number_of_characters": 224, + "num_samples": 8, + "num_queries": 4, + "num_documents": 4, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 4, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 4, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 4, + "hf_subset_descriptive_stats": { + "eng": { + "number_of_characters": 112, + "num_samples": 4, + "num_queries": 2, + "num_documents": 2, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 2, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 2, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 2, + }, + "fra": { + "number_of_characters": 112, + "num_samples": 4, + "num_queries": 2, + "num_documents": 2, + "min_document_length": 23, + "average_document_length": 26.0, + "max_document_length": 29, + "unique_documents": 2, + "min_query_length": 27, + "average_query_length": 30.0, + "max_query_length": 33, + "unique_queries": 2, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 2, + }, + }, + }, "test": { "number_of_characters": 224, "num_samples": 8, @@ -1404,14 +1460,14 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "unique_relevant_docs": 2, }, }, - } 
+ }, } metadata = TaskMetadata( type="Retrieval", name="MockMultilingualRetrievalTask", main_score="ndcg_at_10", - **general_args, # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore ) metadata.eval_langs = multilingual_eval_langs @@ -1420,14 +1476,22 @@ def load_data(self, **kwargs): "test": { "q1": "This is a test sentence", "q2": "This is another test sentence", - } + }, + "val": { + "q1": "This is a test sentence", + "q2": "This is another test sentence", + }, } self.queries = {"eng": queries, "fra": queries} corpus = { "test": { "d1": "This is a positive sentence", "d2": "This is another positive sentence", - } + }, + "val": { + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", + }, } self.corpus = {"eng": corpus, "fra": corpus} @@ -1436,6 +1500,10 @@ def load_data(self, **kwargs): "q1": {"d1": 1, "d2": 0}, "q2": {"d1": 0, "d2": 1}, }, + "val": { + "q1": {"d1": 1, "d2": 0}, + "q2": {"d1": 0, "d2": 1}, + }, } self.relevant_docs = { "eng": relevant_docs, diff --git a/tests/test_evaluation/test_split_evaluation.py b/tests/test_evaluation/test_split_evaluation.py index a2ca249747..c79f25ac42 100644 --- a/tests/test_evaluation/test_split_evaluation.py +++ b/tests/test_evaluation/test_split_evaluation.py @@ -7,6 +7,7 @@ MockSentenceTransformer, ) from tests.test_benchmark.mock_tasks import ( + MockMultilingualRetrievalTask, MockRetrievalTask, ) @@ -21,6 +22,11 @@ def tasks(): return [MockRetrievalTask()] +@pytest.fixture +def multilingual_tasks(): + return [MockMultilingualRetrievalTask()] + + def test_all_splits_evaluated(model, tasks, tmp_path): evaluation = MTEB(tasks=tasks) results = evaluation.run( @@ -34,6 +40,7 @@ def test_all_splits_evaluated(model, tasks, tmp_path): last_evaluated_splits = evaluation.get_last_evaluated_splits() assert set(last_evaluated_splits["MockRetrievalTask"]) == {"val", "test"} assert len(last_evaluated_splits["MockRetrievalTask"]) == 2 + assert 
results[0].scores.keys() == {"val", "test"} def test_one_missing_split(model, tasks, tmp_path): @@ -49,6 +56,7 @@ def test_one_missing_split(model, tasks, tmp_path): last_evaluated_splits = evaluation.get_last_evaluated_splits() assert set(last_evaluated_splits["MockRetrievalTask"]) == {"val"} assert len(last_evaluated_splits["MockRetrievalTask"]) == 1 + assert results[0].scores.keys() == {"val"} results2 = evaluation.run( model, @@ -62,11 +70,12 @@ def test_one_missing_split(model, tasks, tmp_path): last_evaluated_splits = evaluation.get_last_evaluated_splits() assert set(last_evaluated_splits["MockRetrievalTask"]) == {"test"} assert len(last_evaluated_splits["MockRetrievalTask"]) == 1 + assert results2[0].scores.keys() == {"test", "val"} def test_no_missing_splits(model, tasks, tmp_path): evaluation = MTEB(tasks=tasks) - _ = evaluation.run( + results = evaluation.run( model, eval_splits=["val", "test"], output_folder=str(tmp_path / "testcase3"), @@ -76,9 +85,10 @@ def test_no_missing_splits(model, tasks, tmp_path): last_evaluated_splits = evaluation.get_last_evaluated_splits() assert "MockRetrievalTask" in last_evaluated_splits assert len(last_evaluated_splits["MockRetrievalTask"]) == 2 + assert results[0].scores.keys() == {"test", "val"} evaluation = MTEB(tasks=tasks) - _ = evaluation.run( + results = evaluation.run( model, eval_splits=["val", "test"], output_folder=str(tmp_path / "testcase3"), @@ -89,3 +99,209 @@ def test_no_missing_splits(model, tasks, tmp_path): last_evaluated_splits = evaluation.get_last_evaluated_splits() assert "MockRetrievalTask" in last_evaluated_splits assert len(last_evaluated_splits["MockRetrievalTask"]) == 0 + assert results[0].scores.keys() == {"test", "val"} + + +def test_all_languages_evaluated(model, multilingual_tasks, tmp_path): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "all_lang_evaluated"), + verbosity=2, + eval_subsets=None, + ) + 
assert "MockMultilingualRetrievalTask" == results[0].task_name + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert "MockMultilingualRetrievalTask" in last_evaluated_splits + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert last_evaluated_splits["MockMultilingualRetrievalTask"] == ["test"] + assert results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 2 + + +def test_missing_language(model, multilingual_tasks, tmp_path): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "missing_lang_test"), + verbosity=2, + eval_subsets=["eng"], + ) + + assert "MockMultilingualRetrievalTask" == results[0].task_name + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert "MockMultilingualRetrievalTask" in last_evaluated_splits + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert last_evaluated_splits["MockMultilingualRetrievalTask"] == ["test"] + assert results[0].scores.keys() == {"test"} + assert results[0].languages == ["eng"] + + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "missing_lang_test"), + verbosity=2, + eval_subsets=["eng", "fra"], + overwrite_results=True, + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert last_evaluated_splits["MockMultilingualRetrievalTask"] == ["test"] + assert sorted(results[0].languages) == ["eng", "fra"] + assert results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 2 + + +def test_no_missing_languages(model, multilingual_tasks, tmp_path): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "no_missing_lang_test"), + verbosity=2, + eval_subsets=["eng", "fra"], + ) + 
last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert "MockMultilingualRetrievalTask" in last_evaluated_splits + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 2 + assert sorted(results[0].languages) == ["eng", "fra"] + + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "no_missing_lang_test"), + verbosity=2, + eval_subsets=["eng", "fra"], + overwrite_results=True, + ) + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert "MockMultilingualRetrievalTask" in last_evaluated_splits + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 0 + assert results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 2 + assert sorted(results[0].languages) == ["eng", "fra"] + + +def test_partial_languages(model, multilingual_tasks, tmp_path): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "partial_lang_test"), + verbosity=2, + eval_subsets=["fra"], + ) + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert last_evaluated_splits["MockMultilingualRetrievalTask"] == ["test"] + assert results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 1 + assert results[0].languages == ["fra"] + + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "partial_lang_test"), + verbosity=2, + eval_subsets=["fra", "eng"], + overwrite_results=True, + ) + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert last_evaluated_splits["MockMultilingualRetrievalTask"] == ["test"] + assert 
results[0].scores.keys() == {"test"} + assert len(results[0].scores["test"]) == 2 + assert sorted(results[0].languages) == ["eng", "fra"] + + +def test_multilingual_one_missing_split_no_missing_lang( + model, multilingual_tasks, tmp_path +): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["val"], + output_folder=str(tmp_path / "partial_langs_partial_splits"), + verbosity=2, + eval_subsets=["eng", "fra"], + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 + assert set(last_evaluated_splits["MockMultilingualRetrievalTask"]) == {"val"} + assert sorted(results[0].languages) == ["eng", "fra"] + assert results[0].scores.keys() == {"val"} + assert len(results[0].scores["val"]) == 2 + + results = evaluation.run( + model, + eval_splits=["val", "test"], + output_folder=str(tmp_path / "partial_langs_partial_splits"), + verbosity=2, + eval_subsets=["eng", "fra"], + overwrite_results=True, + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert set(last_evaluated_splits["MockMultilingualRetrievalTask"]) == {"test"} + assert sorted(results[0].languages) == ["eng", "fra"] + assert results[0].scores.keys() == {"test", "val"} + assert len(results[0].scores["test"]) == 2 + assert len(results[0].scores["val"]) == 2 + + +def test_multilingual_one_missing_lang_in_one_split( + model, multilingual_tasks, tmp_path +): + evaluation = MTEB(tasks=multilingual_tasks) + results = evaluation.run( + model, + eval_splits=["val"], + output_folder=str(tmp_path / "one_lang_one_split"), + verbosity=2, + eval_subsets=["eng", "fra"], + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert set(last_evaluated_splits["MockMultilingualRetrievalTask"]) == {"val"} + assert sorted(results[0].languages) == ["eng", "fra"] + assert results[0].scores.keys() == {"val"} + assert len(results[0].scores["val"]) == 2 + + results = 
evaluation.run( + model, + eval_splits=["val", "test"], + output_folder=str(tmp_path / "one_lang_one_split"), + verbosity=2, + eval_subsets=["eng"], + overwrite_results=True, + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert set(last_evaluated_splits["MockMultilingualRetrievalTask"]) == {"test"} + assert sorted(results[0].languages) == ["eng", "fra"] + assert results[0].scores.keys() == {"test", "val"} + assert len(results[0].scores["test"]) == 1 + assert len(results[0].scores["val"]) == 2 + + results = evaluation.run( + model, + eval_splits=["test"], + output_folder=str(tmp_path / "one_lang_one_split"), + verbosity=2, + eval_subsets=["eng", "fra"], + overwrite_results=True, + ) + + last_evaluated_splits = evaluation.get_last_evaluated_splits() + assert set(last_evaluated_splits["MockMultilingualRetrievalTask"]) == {"test"} + assert sorted(results[0].languages) == ["eng", "fra"] + # output merged result with previous results + assert results[0].scores.keys() == {"test", "val"} + assert len(results[0].scores["test"]) == 2 From ad05983fc3e44afc9087328f010a06ceb83f6f7d Mon Sep 17 00:00:00 2001 From: Aashka Trivedi Date: Thu, 19 Dec 2024 03:24:10 -0500 Subject: [PATCH 20/31] Add IBM Granite Embedding Models (#1613) * add IBM granite embedding models * lint formatting * add adapted_from and superseded_by to ModelMeta --- mteb/models/ibm_granite_models.py | 114 ++++++++++++++++++++++++++ mteb/models/overview.py | 2 + mteb/models/rerankers_monot5_based.py | 6 +- 3 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 mteb/models/ibm_granite_models.py diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py new file mode 100644 index 0000000000..c2443de233 --- /dev/null +++ b/mteb/models/ibm_granite_models.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, sentence_transformers_loader + +GRANITE_LANGUAGES = [ + "ara_Latn", + 
"ces_Latn", + "deu_Latn", + "eng_Latn", + "spa_Latn", + "fra_Latn", + "ita_Latn", + "jpn_Latn", + "kor_Latn", + "nld_Latn", + "por_Latn", + "zho_Hant", + "zho_Hans", +] + + +granite_107m_multilingual = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-107m-multilingual", + revision="47db56afe692f731540413c67dd818ff492277e7", + ), + name="ibm-granite/granite-embedding-107m-multilingual", + languages=GRANITE_LANGUAGES, + open_weights=True, + revision="47db56afe692f731540413c67dd818ff492277e7", + release_date="2024-12-18", + n_parameters=107_000_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/ibm-granite/granite-embedding-107m-multilingual", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, +) + +granite_278m_multilingual = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-278m-multilingual", + revision="84e3546b88b0cb69f8078608a1df558020bcbf1f", + ), + name="ibm-granite/granite-embedding-278m-multilingual", + languages=GRANITE_LANGUAGES, + open_weights=True, + revision="84e3546b88b0cb69f8078608a1df558020bcbf1f", + release_date="2024-12-18", + n_parameters=278_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, +) + +granite_30m_english = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-30m-english", + revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", + ), + name="ibm-granite/granite-embedding-30m-english", + languages=["eng_Latn"], + open_weights=True, + 
revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", + release_date="2024-12-18", + n_parameters=30_000_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/ibm-granite/granite-embedding-30m-english", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, +) + +granite_125m_english = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-125m-english", + revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", + ), + name="ibm-granite/granite-embedding-125m-english", + languages=["eng_Latn"], + open_weights=True, + revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", + release_date="2024-12-18", + n_parameters=125_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/ibm-granite/granite-embedding-125m-english", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index f93c8862ff..5e6cd0184c 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -21,6 +21,7 @@ google_models, gritlm_models, gte_models, + ibm_granite_models, jina_models, linq_models, llm2vec_models, @@ -56,6 +57,7 @@ google_models, gritlm_models, gte_models, + ibm_granite_models, jina_models, linq_models, llm2vec_models, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index b96897ee51..7ece40e3cf 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -94,8 +94,10 @@ def get_prediction_tokens( token_true_id = tokenizer.get_vocab()[token_true] return token_false_id, token_true_id else: - raise Exception(f"We don't know the indexes for the non-relevant/relevant tokens for\ - the checkpoint 
{model_name_or_path} and you did not provide any.") + raise Exception( + f"We don't know the indexes for the non-relevant/relevant tokens for\ + the checkpoint {model_name_or_path} and you did not provide any." + ) else: token_false_id = tokenizer.get_vocab()[token_false] token_true_id = tokenizer.get_vocab()[token_true] From 7c8e094743c236a46d892f7cfa59529d64ef141b Mon Sep 17 00:00:00 2001 From: Daniel Buades Marcos Date: Fri, 20 Dec 2024 16:31:29 +0100 Subject: [PATCH 21/31] fix: disable co2_tracker for API models (#1614) --- mteb/evaluation/MTEB.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index cf454fc25a..cb56df52b1 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -436,6 +436,10 @@ def run( if isinstance(model, (SentenceTransformer, CrossEncoder)): model = SentenceTransformerWrapper(model) + ## Disable co2_tracker for API models + if "API" in meta.framework: + co2_tracker = False + if output_path: self._save_model_metadata(meta, output_path) From d8c015fd5a87eb966df9521b7d9f6b5cfa76990b Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 20 Dec 2024 15:49:40 +0000 Subject: [PATCH 22/31] 1.25.2 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dece91213a..80fdce98be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.25.1" +version = "1.25.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 0c444827f97bc558cbe5573714b3f4d9e7d745c0 Mon Sep 17 00:00:00 2001 From: Daniel Buades Marcos Date: Fri, 20 Dec 2024 20:21:49 +0100 Subject: [PATCH 23/31] fix: set `use_instructions` to True in models using prompts (#1616) feat: set `use_instructions` to True in models using prompts --- mteb/models/arctic_models.py | 16 ++++++++-------- 
mteb/models/bge_models.py | 6 +++--- mteb/models/cohere_models.py | 8 ++++---- mteb/models/jina_models.py | 2 +- mteb/models/ru_sentence_models.py | 5 +++-- mteb/models/uae_models.py | 2 +- mteb/models/voyage_models.py | 16 ++++++++-------- 7 files changed, 28 insertions(+), 27 deletions(-) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index eadc4065fe..9fae561107 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -100,7 +100,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-xs", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, ) @@ -125,7 +125,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-s", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, ) @@ -150,7 +150,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", ) @@ -174,7 +174,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", ) @@ -198,7 +198,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", ) @@ -225,7 +225,7 @@ license="apache-2.0", 
reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", ) @@ -249,7 +249,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, ) @@ -273,7 +273,7 @@ license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0", similarity_fn_name="cosine", - use_instructions=False, + use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, ) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index b643b4dfb6..cc183374c6 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -26,7 +26,7 @@ reference="https://huggingface.co/BAAI/bge-small-en-v1.5", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, + use_instructions=True, ) bge_base_en_v1_5 = ModelMeta( @@ -49,7 +49,7 @@ reference="https://huggingface.co/BAAI/bge-base-en-v1.5", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, + use_instructions=True, ) bge_large_en_v1_5 = ModelMeta( @@ -72,5 +72,5 @@ reference="https://huggingface.co/BAAI/bge-large-en-v1.5", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, + use_instructions=True, ) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 2a8aa1e3d3..43a797342d 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -234,7 +234,7 @@ def encode( license=None, similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) cohere_eng_3 = ModelMeta( @@ -256,7 +256,7 @@ def encode( 
license=None, similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) cohere_mult_light_3 = ModelMeta( @@ -278,7 +278,7 @@ def encode( license=None, similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) cohere_eng_light_3 = ModelMeta( @@ -300,5 +300,5 @@ def encode( license=None, similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 7b2c343a1d..122f190657 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -220,6 +220,6 @@ def encode( license="cc-by-nc-4.0", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, + use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 301892e070..f90111b954 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -79,7 +79,7 @@ sentence_transformers_loader, model_name="deepvk/USER-base", revision="436a489a2087d61aa670b3496a9915f84e46c861", - prompts={"query": "query: ", "passage": "passage: "}, + model_prompts={"query": "query: ", "passage": "passage: "}, ), name="deepvk/USER-base", languages=["rus_Cyrl"], @@ -94,7 +94,7 @@ reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, + use_instructions=True, ) deberta_v1_ru = ModelMeta( @@ -234,4 +234,5 @@ open_weights=True, revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", + use_instructions=True, ) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 942d508949..5c47cba67d 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -74,5 +74,5 @@ def encode( 
similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/WhereIsAI/UAE-Large-V1", - use_instructions=False, + use_instructions=True, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 57453b0762..0e6ef71d94 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -181,7 +181,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_law_2 = ModelMeta( @@ -203,7 +203,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_code_2 = ModelMeta( @@ -225,7 +225,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_large_2 = ModelMeta( @@ -247,7 +247,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_2 = ModelMeta( @@ -269,7 +269,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -290,7 +290,7 @@ def _batched_encode( reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_3 = ModelMeta( @@ -312,7 +312,7 
@@ def _batched_encode( reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) voyage_3_lite = ModelMeta( @@ -334,5 +334,5 @@ def _batched_encode( reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], - use_instructions=False, + use_instructions=True, ) From 2024338ec0dcf3a89ad3715174e13823f2d41b88 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 20 Dec 2024 19:38:06 +0000 Subject: [PATCH 24/31] 1.25.3 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 80fdce98be..50261223a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.25.2" +version = "1.25.3" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From eb29eb3cec577420916a2a07d2d6b74e208e86c5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:05:32 +0300 Subject: [PATCH 25/31] update RetrievalEvaluator.py --- .../evaluators/RetrievalEvaluator.py | 17 +----- mteb/evaluation/evaluators/model_classes.py | 55 +++++++------------ 2 files changed, 21 insertions(+), 51 deletions(-) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 79a45c8556..5ad0b97530 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -3,7 +3,7 @@ import logging from typing import Any -from mteb.evaluation.evaluators.model_classes import ( +from .model_classes import ( DenseRetrievalExactSearch, DRESModel, is_cross_encoder_compatible, @@ -14,10 +14,6 @@ add_task_specific_scores, calculate_retrieval_scores, evaluate_abstention, - 
confidence_scores, - convert_conv_history_to_query, - cos_sim, - download, hole, mrr, recall_cap, @@ -55,14 +51,6 @@ def __init__( self.top_k = ( max(k_values) if "top_k" not in kwargs else kwargs["top_k"] ) # can lower it if reranking - self.score_function = ( - retriever.mteb_model_meta.similarity_fn_name - if ( - hasattr(retriever, "mteb_model_meta") - and retriever.mteb_model_meta.similarity_fn_name - ) - else score_function - ) self.task_name = task_name def __call__( @@ -93,7 +81,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, task_name=self.task_name, # type: ignore instructions=instructions, **kwargs, @@ -103,7 +90,6 @@ def __call__( corpus, queries, self.top_k, - self.score_function, instructions=instructions, request_qid=qid, task_name=self.task_name, @@ -123,6 +109,7 @@ def evaluate( dict[str, float], dict[str, float], dict[str, float], + dict[str, float], ]: if ignore_identical_ids: logger.debug( diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index 146d529dc9..55c664f2c9 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any +import numpy as np import torch import tqdm from sentence_transformers import CrossEncoder, SentenceTransformer @@ -53,32 +54,24 @@ def __init__( ): # Model is class that provides encode_corpus() and encode_queries() self.model = model - self.encode_kwargs = encode_kwargs + self.encode_kwargs = encode_kwargs.copy() - if "batch_size" not in encode_kwargs: - encode_kwargs["batch_size"] = 128 if "show_progress_bar" not in encode_kwargs: - encode_kwargs["show_progress_bar"] = True - - self.score_functions = {"cos_sim": cos_sim, "dot": dot_score, "cosine": cos_sim} - self.score_function_desc = { - "cos_sim": "Cosine Similarity", - "cosine": "Cosine Similarity", - "dot": "Dot Product", - } + self.encode_kwargs["show_progress_bar"] = True + 
self.corpus_chunk_size = corpus_chunk_size if isinstance(previous_results, Path): self.previous_results = str(previous_results) else: self.previous_results = previous_results - self.batch_size = encode_kwargs.get("batch_size") - self.show_progress_bar = encode_kwargs.get("show_progress_bar") + self.batch_size = self.encode_kwargs.get("batch_size", 32) + self.show_progress_bar = self.encode_kwargs.get("show_progress_bar") self.results = {} if self.previous_results is not None: self.previous_results = self.load_results_file() - if isinstance(self.model, CrossEncoder): + if hasattr(self.model, "predict"): # load the predict instance from the CrossEncoder # custom functions can be used by extending the DenseRetrievalExactSearch class self.predict = self.model.predict @@ -88,7 +81,6 @@ def search( corpus: dict[str, dict[str, str]], queries: dict[str, str], top_k: int, - score_function: str, task_name: str, instructions: dict[str, str] | None = None, request_qid: str | None = None, @@ -102,7 +94,6 @@ def search( corpus: Dictionary mapping corpus IDs to document dictionaries queries: Dictionary mapping query IDs to query strings top_k: Number of top results to return - score_function: Scoring function to use ('cos_sim' or 'dot') task_name: Name of the task instructions: Optional instructions to append to queries request_qid: Optional request query ID @@ -110,11 +101,6 @@ def search( top_ranked: Optional dict mapping query IDs to lists of pre-ranked corpus IDs **kwargs: Additional keyword arguments passed to the underlying model """ - if score_function not in self.score_functions: - raise ValueError( - f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" - ) - logger.info("Encoding Queries.") query_ids = list(queries.keys()) self.results = {qid: {} for qid in query_ids} @@ -159,10 +145,6 @@ def search( # Map back to original order but reuse embeddings query_embeddings = unique_query_embeddings[query_idx_mapping] - 
logger.info( - f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})" - ) - if top_ranked is not None: logger.info("Performing reranking on pre-ranked documents...") result_heaps = self._rerank_documents( @@ -171,7 +153,6 @@ def search( corpus=corpus, top_ranked=top_ranked, top_k=top_k, - score_function=score_function, task_name=task_name, request_qid=request_qid, return_sorted=return_sorted, @@ -183,7 +164,6 @@ def search( query_embeddings=query_embeddings, corpus=corpus, top_k=top_k, - score_function=score_function, task_name=task_name, request_qid=request_qid, return_sorted=return_sorted, @@ -198,11 +178,10 @@ def search( def _rerank_documents( self, query_ids: list[str], - query_embeddings: torch.Tensor, + query_embeddings: np.ndarray, corpus: dict[str, dict[str, str]], top_ranked: dict[str, list[str]], top_k: int, - score_function: str, task_name: str, request_qid: str | None = None, return_sorted: bool = False, @@ -259,8 +238,10 @@ def _rerank_documents( # Ensure query embedding is on the correct device and has correct shape query_embedding = query_embeddings[query_idx].unsqueeze(0) + score_function = self.model.score if hasattr(self.model, "score") else cos_sim + with torch.inference_mode(): - scores = self.score_functions[score_function]( + scores = score_function( query_embedding, query_doc_embeddings, ) @@ -305,7 +286,6 @@ def _full_corpus_search( query_embeddings: torch.Tensor, corpus: dict[str, dict[str, str]], top_k: int, - score_function: str, task_name: str, request_qid: str | None = None, return_sorted: bool = False, @@ -338,17 +318,20 @@ def _full_corpus_search( logging.info("Computing Similarities...") query_embeddings = torch.as_tensor(query_embeddings).to(device) sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) + + score_function = self.model.score if hasattr(self.model, "score") else cos_sim + with torch.inference_mode(): - cos_scores = self.score_functions[score_function]( + scores = 
score_function( query_embeddings, sub_corpus_embeddings ) # get top-k values cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - cos_scores, + scores, min( top_k + 1, - len(cos_scores[1]) if len(cos_scores) > 1 else len(cos_scores[-1]), + len(scores[1]) if len(scores) > 1 else len(scores[-1]), ), dim=1, largest=True, @@ -411,7 +394,7 @@ def search_cross_encoder( for qid in queries.keys(): if self.previous_results is None: # try to use all of them - logging.logging( + logging.info( f"previous_results is None. Using all the documents to rerank: {len(corpus)}" ) q_results = {doc_id: 0.0 for doc_id in corpus.keys()} @@ -461,7 +444,7 @@ def search_cross_encoder( len(queries_in_pair) == len(corpus_in_pair) == len(instructions_in_pair) ) - if isinstance(self.model.model, CrossEncoder): + if hasattr(self.model, "predict"): # can't take instructions, so add them here queries_in_pair = [ f"{q} {i}".strip() From 107dd4ae4e8a4cf6041a05ffb2bb6c7f22cfe209 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:06:22 +0300 Subject: [PATCH 26/31] update imports --- mteb/benchmarks/benchmarks.py | 48 ++++++++++++++-------------- mteb/models/arctic_models.py | 1 + mteb/tasks/Reranking/__init__.py | 2 ++ mteb/tasks/Reranking/ara/__init__.py | 5 +++ mteb/tasks/Retrieval/__init__.py | 26 +++++++++++++++ mteb/tasks/Retrieval/eng/__init__.py | 26 +++++++++++++++ mteb/tasks/__init__.py | 28 ++++++++++++++++ scripts/generate_imports.py | 3 +- tests/test_TaskMetadata.py | 2 -- 9 files changed, 113 insertions(+), 28 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 0954c2de26..0589baf8ca 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -979,27 +979,27 @@ def load_results( year={2024} }""", ) - -NANOBEIR = Benchmark( - name="NanoBEIR", - tasks=get_tasks( - tasks=[ - "NanoArguAnaRetrieval", - "NanoClimateFeverRetrieval", - "NanoDBPediaRetrieval", - 
"NanoFEVERRetrieval", - "NanoFiQA2018Retrieval", - "NanoHotpotQARetrieval", - "NanoMSMARCORetrieval", - "NanoNFCorpusRetrieval", - "NanoNQRetrieval", - "NanoQuoraRetrieval", - "NanoSCIDOCSRetrieval", - "NanoSciFactRetrieval", - "NanoTouche2020Retrieval", - ], - ), - description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power", - reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", - citation=None, -) +# +# NANOBEIR = Benchmark( +# name="NanoBEIR", +# tasks=get_tasks( +# tasks=[ +# "NanoArguAnaRetrieval", +# "NanoClimateFeverRetrieval", +# "NanoDBPediaRetrieval", +# "NanoFEVERRetrieval", +# "NanoFiQA2018Retrieval", +# "NanoHotpotQARetrieval", +# "NanoMSMARCORetrieval", +# "NanoNFCorpusRetrieval", +# "NanoNQRetrieval", +# "NanoQuoraRetrieval", +# "NanoSCIDOCSRetrieval", +# "NanoSciFactRetrieval", +# "NanoTouche2020Retrieval", +# ], +# ), +# description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power", +# reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", +# citation=None, +# ) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index c07b7a728b..6240962b27 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -3,6 +3,7 @@ from functools import partial from mteb.model_meta import ModelMeta, sentence_transformers_loader + LANGUAGES_V2_0 = [ "afr_Latn", "ara_Arab", diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index 497e2751b8..1e34adfc44 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .ara import NamaaMrTydiReranking from .eng import ( AskUbuntuDupQuestions, MindSmallReranking, @@ -19,6 +20,7 @@ "CMedQAv2", "MMarcoReranking", "T2Reranking", + "NamaaMrTydiReranking", "AskUbuntuDupQuestions", "WebLINXCandidatesReranking", 
"StackOverflowDupQuestions", diff --git a/mteb/tasks/Reranking/ara/__init__.py b/mteb/tasks/Reranking/ara/__init__.py index e69de29bb2..8f56ceada7 100644 --- a/mteb/tasks/Reranking/ara/__init__.py +++ b/mteb/tasks/Reranking/ara/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .NamaaMrTydiReranking import NamaaMrTydiReranking + +__all__ = ["NamaaMrTydiReranking"] diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 859e8d3a49..7118699ac8 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -77,6 +77,19 @@ MLQuestionsRetrieval, MSMARCOHardNegatives, MSMARCOv2, + NanoArguAnaRetrieval, + NanoClimateFeverRetrieval, + NanoDBPediaRetrieval, + NanoFEVERRetrieval, + NanoFiQA2018Retrieval, + NanoHotpotQARetrieval, + NanoMSMARCORetrieval, + NanoNFCorpusRetrieval, + NanoNQRetrieval, + NanoQuoraRetrieval, + NanoSCIDOCSRetrieval, + NanoSciFactRetrieval, + NanoTouche2020Retrieval, NarrativeQARetrieval, NFCorpus, NQHardNegatives, @@ -246,6 +259,7 @@ "LEMBPasskeyRetrieval", "CQADupstackAndroidRetrieval", "TempReasonL2Context", + "NanoDBPediaRetrieval", "ARCChallenge", "LegalBenchCorporateLobbying", "SCIDOCS", @@ -260,22 +274,29 @@ "ClimateFEVERHardNegatives", "CQADupstackWordpressRetrieval", "CQADupstackEnglishRetrieval", + "NanoTouche2020Retrieval", "CQADupstackStatsRetrieval", "MLQuestionsRetrieval", "TempReasonL2Fact", + "NanoSciFactRetrieval", "CQADupstackGamingRetrieval", "CQADupstackWebmastersRetrieval", + "NanoFiQA2018Retrieval", "CQADupstackUnixRetrieval", "TempReasonL3Pure", "CQADupstackPhysicsRetrieval", "FiQA2018", "LitSearchRetrieval", + "NanoFEVERRetrieval", + "NanoMSMARCORetrieval", "FeedbackQARetrieval", "HagridRetrieval", + "NanoNFCorpusRetrieval", "FaithDialRetrieval", "SciFact", "CQADupstackMathematicaRetrieval", "RARbMath", + "NanoNQRetrieval", "HellaSwag", "PIQA", "SpartQA", @@ -283,13 +304,18 @@ "TempReasonL1", "HotpotQA", "HotpotQAHardNegatives", + 
"NanoClimateFeverRetrieval", + "NanoQuoraRetrieval", + "NanoArguAnaRetrieval", "LegalBenchConsumerContractsQA", + "NanoHotpotQARetrieval", "ArguAna", "LEMBWikimQARetrieval", "TempReasonL3Fact", "FEVER", "FEVERHardNegatives", "CQADupstackGisRetrieval", + "NanoSCIDOCSRetrieval", "AILACasedocs", "NFCorpus", "LEMBSummScreenFDRetrieval", diff --git a/mteb/tasks/Retrieval/eng/__init__.py b/mteb/tasks/Retrieval/eng/__init__.py index 47e2498709..9f1717a89f 100644 --- a/mteb/tasks/Retrieval/eng/__init__.py +++ b/mteb/tasks/Retrieval/eng/__init__.py @@ -41,6 +41,19 @@ from .MLQuestions import MLQuestionsRetrieval from .MSMARCORetrieval import MSMARCO, MSMARCOHardNegatives from .MSMARCOv2Retrieval import MSMARCOv2 +from .NanoArguAnaRetrieval import NanoArguAnaRetrieval +from .NanoClimateFeverRetrieval import NanoClimateFeverRetrieval +from .NanoDBPediaRetrieval import NanoDBPediaRetrieval +from .NanoFEVERRetrieval import NanoFEVERRetrieval +from .NanoFiQA2018Retrieval import NanoFiQA2018Retrieval +from .NanoHotpotQARetrieval import NanoHotpotQARetrieval +from .NanoMSMARCORetrieval import NanoMSMARCORetrieval +from .NanoNFCorpusRetrieval import NanoNFCorpusRetrieval +from .NanoNQRetrieval import NanoNQRetrieval +from .NanoQuoraRetrieval import NanoQuoraRetrieval +from .NanoSCIDOCSRetrieval import NanoSCIDOCSRetrieval +from .NanoSciFactRetrieval import NanoSciFactRetrieval +from .NanoTouche2020Retrieval import NanoTouche2020Retrieval from .NarrativeQARetrieval import NarrativeQARetrieval from .NFCorpusRetrieval import NFCorpus from .NQRetrieval import NQ, NQHardNegatives @@ -90,6 +103,7 @@ "LEMBPasskeyRetrieval", "CQADupstackAndroidRetrieval", "TempReasonL2Context", + "NanoDBPediaRetrieval", "ARCChallenge", "LegalBenchCorporateLobbying", "SCIDOCS", @@ -104,22 +118,29 @@ "ClimateFEVERHardNegatives", "CQADupstackWordpressRetrieval", "CQADupstackEnglishRetrieval", + "NanoTouche2020Retrieval", "CQADupstackStatsRetrieval", "MLQuestionsRetrieval", "TempReasonL2Fact", + 
"NanoSciFactRetrieval", "CQADupstackGamingRetrieval", "CQADupstackWebmastersRetrieval", + "NanoFiQA2018Retrieval", "CQADupstackUnixRetrieval", "TempReasonL3Pure", "CQADupstackPhysicsRetrieval", "FiQA2018", "LitSearchRetrieval", + "NanoFEVERRetrieval", + "NanoMSMARCORetrieval", "FeedbackQARetrieval", "HagridRetrieval", + "NanoNFCorpusRetrieval", "FaithDialRetrieval", "SciFact", "CQADupstackMathematicaRetrieval", "RARbMath", + "NanoNQRetrieval", "HellaSwag", "PIQA", "SpartQA", @@ -127,13 +148,18 @@ "TempReasonL1", "HotpotQA", "HotpotQAHardNegatives", + "NanoClimateFeverRetrieval", + "NanoQuoraRetrieval", + "NanoArguAnaRetrieval", "LegalBenchConsumerContractsQA", + "NanoHotpotQARetrieval", "ArguAna", "LEMBWikimQARetrieval", "TempReasonL3Fact", "FEVER", "FEVERHardNegatives", "CQADupstackGisRetrieval", + "NanoSCIDOCSRetrieval", "AILACasedocs", "NFCorpus", "LEMBSummScreenFDRetrieval", diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 6e09541e10..745d4066ae 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -426,6 +426,7 @@ MindSmallReranking, MIRACLReranking, MMarcoReranking, + NamaaMrTydiReranking, NevIR, RuBQReranking, SciDocsReranking, @@ -547,6 +548,19 @@ MSMARCOPLHardNegatives, MSMARCOv2, MultiLongDocRetrieval, + NanoArguAnaRetrieval, + NanoClimateFeverRetrieval, + NanoDBPediaRetrieval, + NanoFEVERRetrieval, + NanoFiQA2018Retrieval, + NanoHotpotQARetrieval, + NanoMSMARCORetrieval, + NanoNFCorpusRetrieval, + NanoNQRetrieval, + NanoQuoraRetrieval, + NanoSCIDOCSRetrieval, + NanoSciFactRetrieval, + NanoTouche2020Retrieval, NarrativeQARetrieval, NeuCLIR2022Retrieval, NeuCLIR2022RetrievalHardNegatives, @@ -1140,6 +1154,7 @@ "LEMBPasskeyRetrieval", "CQADupstackAndroidRetrieval", "TempReasonL2Context", + "NanoDBPediaRetrieval", "ARCChallenge", "LegalBenchCorporateLobbying", "SCIDOCS", @@ -1154,22 +1169,29 @@ "ClimateFEVERHardNegatives", "CQADupstackWordpressRetrieval", "CQADupstackEnglishRetrieval", + "NanoTouche2020Retrieval", 
"CQADupstackStatsRetrieval", "MLQuestionsRetrieval", "TempReasonL2Fact", + "NanoSciFactRetrieval", "CQADupstackGamingRetrieval", "CQADupstackWebmastersRetrieval", + "NanoFiQA2018Retrieval", "CQADupstackUnixRetrieval", "TempReasonL3Pure", "CQADupstackPhysicsRetrieval", "FiQA2018", "LitSearchRetrieval", + "NanoFEVERRetrieval", + "NanoMSMARCORetrieval", "FeedbackQARetrieval", "HagridRetrieval", + "NanoNFCorpusRetrieval", "FaithDialRetrieval", "SciFact", "CQADupstackMathematicaRetrieval", "RARbMath", + "NanoNQRetrieval", "HellaSwag", "PIQA", "SpartQA", @@ -1177,13 +1199,18 @@ "TempReasonL1", "HotpotQA", "HotpotQAHardNegatives", + "NanoClimateFeverRetrieval", + "NanoQuoraRetrieval", + "NanoArguAnaRetrieval", "LegalBenchConsumerContractsQA", + "NanoHotpotQARetrieval", "ArguAna", "LEMBWikimQARetrieval", "TempReasonL3Fact", "FEVER", "FEVERHardNegatives", "CQADupstackGisRetrieval", + "NanoSCIDOCSRetrieval", "AILACasedocs", "NFCorpus", "LEMBSummScreenFDRetrieval", @@ -1280,6 +1307,7 @@ "CMedQAv2", "MMarcoReranking", "T2Reranking", + "NamaaMrTydiReranking", "AskUbuntuDupQuestions", "WebLINXCandidatesReranking", "StackOverflowDupQuestions", diff --git a/scripts/generate_imports.py b/scripts/generate_imports.py index 469d894441..ae331de9df 100644 --- a/scripts/generate_imports.py +++ b/scripts/generate_imports.py @@ -7,8 +7,7 @@ import types from pathlib import Path -# Adjust this import to the correct location of AbsTask. -from mteb.tasks import AbsTask +from mteb.abstasks import AbsTask BASE_DIR = Path("../mteb/tasks") diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 822e7aa599..701abab729 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -4,8 +4,6 @@ from pydantic import ValidationError from mteb.abstasks import AbsTask, TaskMetadata -from mteb import AbsTask -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.overview import get_tasks # Historic datasets without filled metadata. 
Do NOT add new datasets to this list. From 92dba392c730e216b1e2f55a512ce212c7e66387 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:12:43 +0300 Subject: [PATCH 27/31] update imports and metadata --- mteb/abstasks/AbsTaskReranking.py | 1 + .../Reranking/NamaaMrTydiReranking.json | 31 +++++++++++++++++++ .../Retrieval/NanoArguAnaRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoClimateFeverRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoDBPediaRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoFEVERRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoFiQA2018Retrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoHotpotQARetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoMSMARCORetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoNFCorpusRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoNQRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoQuoraRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoSCIDOCSRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoSciFactRetrieval.json | 31 +++++++++++++++++++ .../Retrieval/NanoTouche2020Retrieval.json | 31 +++++++++++++++++++ .../evaluators/RetrievalEvaluator.py | 3 +- mteb/evaluation/evaluators/model_classes.py | 16 +++++----- 17 files changed, 445 insertions(+), 9 deletions(-) create mode 100644 mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoHotpotQARetrieval.json create mode 100644 
mteb/descriptive_stats/Retrieval/NanoMSMARCORetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoNFCorpusRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoNQRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoQuoraRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoSCIDOCSRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoSciFactRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/NanoTouche2020Retrieval.json diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 2ae1f5c359..e8ea495366 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -26,6 +26,7 @@ "MMarcoReranking", "CMedQAv1-reranking", "CMedQAv2-reranking", + "NamaaMrTydiReranking", ] diff --git a/mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json b/mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json new file mode 100644 index 0000000000..74da38e4f0 --- /dev/null +++ b/mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 5504, + "number_of_characters": 1293166, + "num_documents": 4586, + "min_document_length": 0, + "average_document_length": 275.8353685128652, + "max_document_length": 4158, + "unique_documents": 4586, + "num_queries": 918, + "min_query_length": 13, + "average_query_length": 30.702614379084967, + "max_query_length": 93, + "unique_queries": 918, + "none_queries": 0, + "num_relevant_docs": 4586, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 6, + "unique_relevant_docs": 4586, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": 918, + "min_top_ranked_per_query": 2, + "average_top_ranked_per_query": 4.995642701525054, + "max_top_ranked_per_query": 6 + } +} 
\ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json new file mode 100644 index 0000000000..51d1be4353 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 3685, + "number_of_characters": 3737951, + "num_documents": 3635, + "min_document_length": 70, + "average_document_length": 1011.7914718019257, + "max_document_length": 6673, + "unique_documents": 3635, + "num_queries": 50, + "min_query_length": 504, + "average_query_length": 1201.78, + "max_query_length": 2164, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json new file mode 100644 index 0000000000..5a408ec517 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 3458, + "number_of_characters": 5525784, + "num_documents": 3408, + "min_document_length": 33, + "average_document_length": 1619.531690140845, + "max_document_length": 6619, + "unique_documents": 3408, + "num_queries": 50, + "min_query_length": 38, + "average_query_length": 128.4, + "max_query_length": 265, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + 
"max_relevant_docs_per_query": 1, + "unique_relevant_docs": 38, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json new file mode 100644 index 0000000000..cd6f035639 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 6095, + "number_of_characters": 2034629, + "num_documents": 6045, + "min_document_length": 1, + "average_document_length": 336.30669975186106, + "max_document_length": 1390, + "unique_documents": 6045, + "num_queries": 50, + "min_query_length": 8, + "average_query_length": 33.1, + "max_query_length": 63, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json new file mode 100644 index 0000000000..eb7f3d6e95 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5046, + "number_of_characters": 6140916, + "num_documents": 4996, + "min_document_length": 25, + 
"average_document_length": 1228.7119695756605, + "max_document_length": 8491, + "unique_documents": 4996, + "num_queries": 50, + "min_query_length": 17, + "average_query_length": 45.42, + "max_query_length": 83, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json b/mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json new file mode 100644 index 0000000000..92c11900a9 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 4648, + "number_of_characters": 4139437, + "num_documents": 4598, + "min_document_length": 0, + "average_document_length": 899.6326663766855, + "max_document_length": 10506, + "unique_documents": 4598, + "num_queries": 50, + "min_query_length": 18, + "average_query_length": 58.52, + "max_query_length": 97, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/Retrieval/NanoHotpotQARetrieval.json b/mteb/descriptive_stats/Retrieval/NanoHotpotQARetrieval.json new file mode 100644 index 0000000000..ec35252f78 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoHotpotQARetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5140, + "number_of_characters": 1784059, + "num_documents": 5090, + "min_document_length": 24, + "average_document_length": 349.6349705304519, + "max_document_length": 2079, + "unique_documents": 5090, + "num_queries": 50, + "min_query_length": 37, + "average_query_length": 88.34, + "max_query_length": 184, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoMSMARCORetrieval.json b/mteb/descriptive_stats/Retrieval/NanoMSMARCORetrieval.json new file mode 100644 index 0000000000..2deebc7918 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoMSMARCORetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5093, + "number_of_characters": 1666607, + "num_documents": 5043, + "min_document_length": 32, + "average_document_length": 330.159825500694, + "max_document_length": 990, + "unique_documents": 5043, + "num_queries": 50, + "min_query_length": 13, + "average_query_length": 32.22, + "max_query_length": 101, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 
50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoNFCorpusRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoNFCorpusRetrieval.json new file mode 100644 index 0000000000..b6e2e3fb1a --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoNFCorpusRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 3003, + "number_of_characters": 4468144, + "num_documents": 2953, + "min_document_length": 90, + "average_document_length": 1512.7301049779885, + "max_document_length": 9939, + "unique_documents": 2953, + "num_queries": 50, + "min_query_length": 4, + "average_query_length": 21.04, + "max_query_length": 53, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 42, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoNQRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoNQRetrieval.json new file mode 100644 index 0000000000..254a0ac40f --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoNQRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5085, + "number_of_characters": 2648727, + "num_documents": 5035, + "min_document_length": 1, + "average_document_length": 525.5958291956306, + "max_document_length": 6138, + 
"unique_documents": 5035, + "num_queries": 50, + "min_query_length": 32, + "average_query_length": 47.04, + "max_query_length": 83, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoQuoraRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoQuoraRetrieval.json new file mode 100644 index 0000000000..540b0fd3aa --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoQuoraRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5096, + "number_of_characters": 278960, + "num_documents": 5046, + "min_document_length": 2, + "average_document_length": 54.808164883075705, + "max_document_length": 332, + "unique_documents": 5046, + "num_queries": 50, + "min_query_length": 19, + "average_query_length": 47.96, + "max_query_length": 139, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoSCIDOCSRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoSCIDOCSRetrieval.json new file mode 
100644 index 0000000000..78e927e208 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoSCIDOCSRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 2260, + "number_of_characters": 2044730, + "num_documents": 2210, + "min_document_length": 0, + "average_document_length": 923.5705882352942, + "max_document_length": 10000, + "unique_documents": 2210, + "num_queries": 50, + "min_query_length": 38, + "average_query_length": 72.78, + "max_query_length": 143, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoSciFactRetrieval.json b/mteb/descriptive_stats/Retrieval/NanoSciFactRetrieval.json new file mode 100644 index 0000000000..00e8cb4be1 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoSciFactRetrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 2969, + "number_of_characters": 4182563, + "num_documents": 2919, + "min_document_length": 260, + "average_document_length": 1431.2343268242548, + "max_document_length": 10000, + "unique_documents": 2919, + "num_queries": 50, + "min_query_length": 37, + "average_query_length": 95.8, + "max_query_length": 200, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 50, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 50, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, 
+ "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NanoTouche2020Retrieval.json b/mteb/descriptive_stats/Retrieval/NanoTouche2020Retrieval.json new file mode 100644 index 0000000000..de076dae57 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NanoTouche2020Retrieval.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 5794, + "number_of_characters": 12311190, + "num_documents": 5745, + "min_document_length": 3, + "average_document_length": 2142.56953872933, + "max_document_length": 37100, + "unique_documents": 5745, + "num_queries": 49, + "min_query_length": 16, + "average_query_length": 43.42857142857143, + "max_query_length": 83, + "unique_queries": 49, + "none_queries": 0, + "num_relevant_docs": 49, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 49, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 5ad0b97530..be2f5af1f0 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -3,13 +3,12 @@ import logging from typing import Any +from .Evaluator import Evaluator from .model_classes import ( DenseRetrievalExactSearch, DRESModel, is_cross_encoder_compatible, ) - -from .Evaluator import Evaluator from .utils import ( add_task_specific_scores, calculate_retrieval_scores, diff --git 
a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index 55c664f2c9..19c5121c9d 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -10,12 +10,12 @@ import numpy as np import torch import tqdm -from sentence_transformers import CrossEncoder, SentenceTransformer +from sentence_transformers import SentenceTransformer from mteb.encoder_interface import Encoder, PromptType from mteb.model_meta import ModelMeta -from .utils import convert_conv_history_to_query, cos_sim, dot_score, download +from .utils import convert_conv_history_to_query, cos_sim, download logger = logging.getLogger(__name__) @@ -238,7 +238,9 @@ def _rerank_documents( # Ensure query embedding is on the correct device and has correct shape query_embedding = query_embeddings[query_idx].unsqueeze(0) - score_function = self.model.score if hasattr(self.model, "score") else cos_sim + score_function = ( + self.model.score if hasattr(self.model, "score") else cos_sim + ) with torch.inference_mode(): scores = score_function( @@ -319,12 +321,12 @@ def _full_corpus_search( query_embeddings = torch.as_tensor(query_embeddings).to(device) sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) - score_function = self.model.score if hasattr(self.model, "score") else cos_sim + score_function = ( + self.model.score if hasattr(self.model, "score") else cos_sim + ) with torch.inference_mode(): - scores = score_function( - query_embeddings, sub_corpus_embeddings - ) + scores = score_function(query_embeddings, sub_corpus_embeddings) # get top-k values cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( From 7b4ae88e145bae39851f9f8219af39dc30d3af05 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:29:18 +0300 Subject: [PATCH 28/31] fix tests --- mteb/benchmarks/benchmarks.py | 48 ++++++++--------- mteb/evaluation/MTEB.py | 2 - 
tests/test_benchmark/mock_tasks.py | 87 ++++++++++++++++++++---------- 3 files changed, 84 insertions(+), 53 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 0589baf8ca..0954c2de26 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -979,27 +979,27 @@ def load_results( year={2024} }""", ) -# -# NANOBEIR = Benchmark( -# name="NanoBEIR", -# tasks=get_tasks( -# tasks=[ -# "NanoArguAnaRetrieval", -# "NanoClimateFeverRetrieval", -# "NanoDBPediaRetrieval", -# "NanoFEVERRetrieval", -# "NanoFiQA2018Retrieval", -# "NanoHotpotQARetrieval", -# "NanoMSMARCORetrieval", -# "NanoNFCorpusRetrieval", -# "NanoNQRetrieval", -# "NanoQuoraRetrieval", -# "NanoSCIDOCSRetrieval", -# "NanoSciFactRetrieval", -# "NanoTouche2020Retrieval", -# ], -# ), -# description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power", -# reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", -# citation=None, -# ) + +NANOBEIR = Benchmark( + name="NanoBEIR", + tasks=get_tasks( + tasks=[ + "NanoArguAnaRetrieval", + "NanoClimateFeverRetrieval", + "NanoDBPediaRetrieval", + "NanoFEVERRetrieval", + "NanoFiQA2018Retrieval", + "NanoHotpotQARetrieval", + "NanoMSMARCORetrieval", + "NanoNFCorpusRetrieval", + "NanoNQRetrieval", + "NanoQuoraRetrieval", + "NanoSCIDOCSRetrieval", + "NanoSciFactRetrieval", + "NanoTouche2020Retrieval", + ], + ), + description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power", + reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", + citation=None, +) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index cf004b95ed..faad939015 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -300,7 +300,6 @@ def _run_eval( task: AbsTask, model: Encoder, split: str, - output_folder: str | None, subsets_to_run: list[str] | None = None, *, 
encode_kwargs: dict[str, Any], @@ -311,7 +310,6 @@ def _run_eval( model, split, subsets_to_run=subsets_to_run, - output_folder=output_folder, encode_kwargs=encode_kwargs, **kwargs, ) diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 7c00e1bb4d..142b4b42ad 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -1448,58 +1448,91 @@ def load_data(self, **kwargs): class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): expected_stats = { "val": { - "number_of_characters": 224, "num_samples": 8, - "num_queries": 4, + "number_of_characters": 224, "num_documents": 4, - "min_document_length": 23, - "average_document_length": 26.0, - "max_document_length": 29, + "min_document_length": 27, + "average_document_length": 30.0, + "max_document_length": 33, "unique_documents": 4, - "min_query_length": 27, - "average_query_length": 30.0, - "max_query_length": 33, + "num_queries": 4, + "min_query_length": 23, + "average_query_length": 26.0, + "max_query_length": 29, "unique_queries": 4, + "none_queries": 0, + "num_relevant_docs": 8, "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 2.0, + "average_relevant_docs_per_query": 1.0, "max_relevant_docs_per_query": 2, "unique_relevant_docs": 4, + "num_instructions": None, + "min_instruction_length": None, + "average_instruction_length": None, + "max_instruction_length": None, + "unique_instructions": None, + "num_top_ranked": None, + "min_top_ranked_per_query": None, + "average_top_ranked_per_query": None, + "max_top_ranked_per_query": None, "hf_subset_descriptive_stats": { "eng": { - "number_of_characters": 112, "num_samples": 4, - "num_queries": 2, + "number_of_characters": 112, "num_documents": 2, - "min_document_length": 23, - "average_document_length": 26.0, - "max_document_length": 29, + "min_document_length": 27, + "average_document_length": 30.0, + "max_document_length": 33, "unique_documents": 2, - 
"min_query_length": 27, - "average_query_length": 30.0, - "max_query_length": 33, + "num_queries": 2, + "min_query_length": 23, + "average_query_length": 26.0, + "max_query_length": 29, "unique_queries": 2, + "none_queries": 0, + "num_relevant_docs": 4, "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 2.0, + "average_relevant_docs_per_query": 1.0, "max_relevant_docs_per_query": 2, "unique_relevant_docs": 2, + "num_instructions": None, + "min_instruction_length": None, + "average_instruction_length": None, + "max_instruction_length": None, + "unique_instructions": None, + "num_top_ranked": None, + "min_top_ranked_per_query": None, + "average_top_ranked_per_query": None, + "max_top_ranked_per_query": None, }, "fra": { - "number_of_characters": 112, "num_samples": 4, - "num_queries": 2, + "number_of_characters": 112, "num_documents": 2, - "min_document_length": 23, - "average_document_length": 26.0, - "max_document_length": 29, + "min_document_length": 27, + "average_document_length": 30.0, + "max_document_length": 33, "unique_documents": 2, - "min_query_length": 27, - "average_query_length": 30.0, - "max_query_length": 33, + "num_queries": 2, + "min_query_length": 23, + "average_query_length": 26.0, + "max_query_length": 29, "unique_queries": 2, + "none_queries": 0, + "num_relevant_docs": 4, "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 2.0, + "average_relevant_docs_per_query": 1.0, "max_relevant_docs_per_query": 2, "unique_relevant_docs": 2, + "num_instructions": None, + "min_instruction_length": None, + "average_instruction_length": None, + "max_instruction_length": None, + "unique_instructions": None, + "num_top_ranked": None, + "min_top_ranked_per_query": None, + "average_top_ranked_per_query": None, + "max_top_ranked_per_query": None, }, }, }, From 788f54e5b3edff7cd4497d7babef6c3161bd5167 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 14:55:35 +0300 
Subject: [PATCH 29/31] fix tests --- mteb/evaluation/evaluators/model_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index 19c5121c9d..ba1ac21937 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -512,7 +512,6 @@ def encode_corpus( self, corpus: list[dict[str, str]], task_name: str, - batch_size: int, prompt_type: PromptType = PromptType.passage, **kwargs, ): @@ -521,7 +520,6 @@ def encode_corpus( sentences, task_name=task_name, prompt_type=prompt_type, - batch_size=batch_size, **kwargs, ) return corpus_embeddings From 06017efdab72362394f5e6ebbba8c575c9a22403 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 16:08:26 +0300 Subject: [PATCH 30/31] fix output path for retrieval --- mteb/evaluation/MTEB.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index faad939015..4b1e06e077 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -476,6 +476,7 @@ def run( save_path = None if output_path: + kwargs["output_folder"] = output_folder # needed for retrieval tasks save_path = output_path / f"{task.metadata.name}{task.save_suffix}.json" if save_path.exists(): existing_results = TaskResult.from_disk(save_path) From 7144fcaf89d509fbcdd670404ed51ea193ea3607 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 21 Dec 2024 22:25:34 +0300 Subject: [PATCH 31/31] fix similarity function --- mteb/evaluation/evaluators/model_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index ba1ac21937..7e66f22e65 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -239,7 +239,7 
@@ def _rerank_documents( query_embedding = query_embeddings[query_idx].unsqueeze(0) score_function = ( - self.model.score if hasattr(self.model, "score") else cos_sim + self.model.similarity if hasattr(self.model, "similarity") else cos_sim ) with torch.inference_mode(): @@ -322,7 +322,7 @@ def _full_corpus_search( sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) score_function = ( - self.model.score if hasattr(self.model, "score") else cos_sim + self.model.similarity if hasattr(self.model, "similarity") else cos_sim ) with torch.inference_mode():