diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 33b91f2301..eee55aa3fa 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,7 +31,7 @@ jobs: - name: Python Semantic Release id: release - uses: python-semantic-release/python-semantic-release@v8.0.4 + uses: python-semantic-release/python-semantic-release@v9.8.9 with: github_token: ${{ secrets.RELEASE }} changelog: "false" @@ -44,7 +44,7 @@ jobs: # publisher must already be configured on PyPI. - name: Publish package distributions to GitHub Releases - uses: python-semantic-release/upload-to-gh-release@main + uses: python-semantic-release/upload-to-gh-release@v9.8.9 if: steps.release.outputs.released == 'true' with: github_token: ${{ secrets.RELEASE }} diff --git a/.github/workflows/stale_pr.yml b/.github/workflows/stale_pr.yml new file mode 100644 index 0000000000..3328484f52 --- /dev/null +++ b/.github/workflows/stale_pr.yml @@ -0,0 +1,21 @@ +name: Label and Close stale PRs +on: + schedule: + - cron: '30 1 * * *' # runs daily at 1:30 AM +jobs: + stale: + runs-on: ubuntu-latest + permissions: + actions: write + issues: write + pull-requests: write + steps: + - uses: actions/stale@v9.1.0 + with: + stale-pr-label: 'stale' + stale-issue-label: 'stale' + days-before-pr-stale: 14 + days-before-pr-close: 7 + any-of-issue-labels: 'more information needed' + stale-pr-message: 'This pull request has been automatically marked as stale due to inactivity.' + close-pr-message: 'This pull request has been automatically closed due to inactivity.' diff --git a/Makefile b/Makefile index d0f857c9f4..9428427158 100644 --- a/Makefile +++ b/Makefile @@ -50,7 +50,7 @@ model-load-test: dataset-load-test: @echo "--- 🚀 Running dataset load test ---" - pytest -n auto -m test_datasets + pytest -m test_datasets leaderboard-build-test: @echo "--- 🚀 Running leaderboard build test ---" diff --git a/docs/adding_a_dataset.md b/docs/adding_a_dataset.md index 2f34b271b6..6df514c514 100644 --- a/docs/adding_a_dataset.md +++ b/docs/adding_a_dataset.md @@ -242,14 +242,13 @@ Before you commit, here is a checklist you should complete before submitting: An easy way to test it is using: ```python -from mteb import MTEB -from sentence_transformers import SentenceTransformer +import mteb +# sample model: +model = mteb.get_model("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") -# Define the sentence-transformers model name -model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" - -model = SentenceTransformer(model_name) -evaluation = MTEB(tasks=[YourNewTask()]) +task = mteb.get_task("{name of your task}") +evaluation = mteb.MTEB(tasks=[task]) +evaluation.run(model) ``` - [ ] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index 256eaee6d5..7411ba6688 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -64,7 +64,7 @@ model = ModelMeta( revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", model_prompts={ "query": "query: ", - "passage": "passage: ", + "document": "passage: ", }, ), ) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a57dd3376d..032113c401 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -8,7 +8,7 @@ The following table gives you an overview of the benchmarks in MTEB. | Name | Leaderboard name | # Tasks | Task Types | Domains | Languages | |------|------------------|---------|------------|---------|-----------| -| [BEIR](https://arxiv.org/abs/2104.08663) | BEIR | 15 | Retrieval: 15 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Web, Written] | eng | +| [BEIR](https://arxiv.org/abs/2104.08663) | BEIR | 15 | Retrieval: 15 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Web, Written] | eng,vie | | [BEIR-NL](https://arxiv.org/abs/2412.08329) | BEIR-NL | 15 | Retrieval: 15 | [Academic, Encyclopaedic, Medical, Non-fiction, Web, Written] | nld | | [BRIGHT](https://brightbenchmark.github.io/) | BRIGHT | 1 | Retrieval: 1 | [Non-fiction, Written] | eng | | [BRIGHT (long)](https://brightbenchmark.github.io/) | BRIGHT (long) | 1 | Retrieval: 1 | [Non-fiction, Written] | eng | @@ -18,6 +18,7 @@ The following table gives you an overview of the benchmarks in MTEB. | [CodeRAG](https://arxiv.org/abs/2406.14497) | CodeRAG | 4 | Reranking: 4 | [Programming] | python | | [Encodechka](https://github.com/avidale/encodechka) | Encodechka | 7 | STS: 2, Classification: 4, PairClassification: 1 | [Fiction, Government, News, Non-fiction, Social, Web, Written] | rus | | [FollowIR](https://arxiv.org/abs/2403.15246) | Instruction Following | 3 | InstructionRetrieval: 3 | [News, Written] | eng | +| [JinaVDR](https://arxiv.org/abs/2506.18902) | Jina Visual Document Retrieval | 43 | DocumentUnderstanding: 43 | [Academic, Engineering, Government, Legal, Medical, News, Social, Web] | ara,ben,deu,eng,fra,hin,hun,ind,ita,jpn,kor,mya,nld,por,rus,spa,tha,urd,vie,zho | | [LongEmbed](https://arxiv.org/abs/2404.12096v2) | Long-context Retrieval | 6 | Retrieval: 6 | [Academic, Blog, Encyclopaedic, Fiction, Non-fiction, Spoken, Written] | eng | | [MIEB(Img)](https://arxiv.org/abs/2504.10471) | Image only | 49 | Any2AnyRetrieval: 15, ImageClassification: 22, ImageClustering: 5, VisualSTS(eng): 5, VisualSTS(multi): 2 | [Blog, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | ara,cmn,deu,eng,fra,ita,kor,nld,pol,por,rus,spa,tur | | [MIEB(Multilingual)](https://arxiv.org/abs/2504.10471) | Image-Text, Multilingual | 130 | ImageClassification: 22, ImageClustering: 5, ZeroShotClassification: 23, VisionCentricQA: 6, Compositionality: 7, VisualSTS(eng): 7, Any2AnyRetrieval: 45, DocumentUnderstanding: 10, Any2AnyMultilingualRetrieval: 3, VisualSTS(multi): 2 | [Academic, Blog, Constructed, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | ara,ben,bul,ces,cmn,dan,deu,ell,eng,est,fas,fil,fin,fra,heb,hin,hrv,hun,ind,ita,jpn,kor,mri,nld,nor,pol,por,quz,ron,rus,spa,swa,swe,tel,tha,tur,ukr,vie,zho | @@ -33,7 +34,7 @@ The following table gives you an overview of the benchmarks in MTEB. | [MTEB(Scandinavian, v1)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | Scandinavian | 28 | BitextMining: 2, Classification: 13, Retrieval: 7, Clustering: 6 | [Blog, Encyclopaedic, Fiction, Government, Legal, News, Non-fiction, Reviews, Social, Spoken, Web, Written] | dan,fao,isl,nno,nob,swe | | [MTEB(cmn, v1)](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB) | Chinese | 32 | Retrieval: 8, Reranking: 4, PairClassification: 2, Clustering: 4, STS: 7, Classification: 7 | [Academic, Entertainment, Financial, Government, Medical, Non-fiction, Written] | cmn | | [MTEB(deu, v1)](https://arxiv.org/html/2401.02709v1) | German | 19 | Classification: 6, Clustering: 4, PairClassification: 2, Reranking: 1, Retrieval: 4, STS: 2 | [Encyclopaedic, Legal, News, Non-fiction, Reviews, Spoken, Web, Written] | deu | -| MTEB(eng, v1) | English Legacy | 56 | Classification: 12, Retrieval: 15, Clustering: 11, Reranking: 4, STS: 10, PairClassification: 3, Summarization: 1 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | eng | +| MTEB(eng, v1) | English Legacy | 56 | Classification: 12, Retrieval: 15, Clustering: 11, Reranking: 4, STS: 10, PairClassification: 3, Summarization: 1 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | eng,vie | | MTEB(eng, v2) | English | 41 | Retrieval: 10, Clustering: 8, Reranking: 2, STS: 9, Classification: 8, PairClassification: 3, Summarization: 1 | [Academic, Blog, Encyclopaedic, Financial, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | eng | | [MTEB(fas, v1)](https://arxiv.org/abs/2502.11571) | Farsi | 60 | Classification: 18, Clustering: 5, PairClassification: 8, Reranking: 2, Retrieval: 21, STS: 3, BitextMining: 3 | [Academic, Blog, Encyclopaedic, Medical, News, Religious, Reviews, Social, Spoken, Web, Written] | fas | | [MTEB(fra, v1)](https://arxiv.org/abs/2405.20468) | French | 25 | Classification: 6, Clustering: 7, PairClassification: 1, Reranking: 2, Retrieval: 5, STS: 3, Summarization: 1 | [Academic, Encyclopaedic, Legal, News, Non-fiction, Reviews, Social, Spoken, Web, Written] | eng,fra | @@ -44,6 +45,8 @@ The following table gives you an overview of the benchmarks in MTEB. | [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | NanoBEIR | 13 | Retrieval: 13 | [Academic, Encyclopaedic, Medical, News, Non-fiction, Social, Web, Written] | eng | | [R2MED](https://r2med.github.io/) | Reasoning-driven medical retrieval | 8 | Retrieval: 8 | [Medical] | eng | | [RAR-b](https://arxiv.org/abs/2404.06347) | Reasoning retrieval | 17 | Retrieval: 17 | [Encyclopaedic, Programming, Written] | eng | +| [RuSciBench](https://link.springer.com/article/10.1134/S1064562424602191) | RuSciBench | 9 | BitextMining: 1, Classification: 4, Retrieval: 2, Regression: 2 | [Academic, Non-fiction, Written] | eng,rus | +| [VN-MTEB (vie, v1)](https://arxiv.org/abs/2507.21500) | Vietnamese | 50 | Retrieval: 24, Classification: 12, PairClassification: 3, Clustering: 5, Reranking: 3, STS: 3 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | vie | | [ViDoRe(v1)](https://arxiv.org/abs/2407.01449) | ViDoRe(v1) | 10 | DocumentUnderstanding: 10 | [Academic] | eng | | [ViDoRe(v2)](https://arxiv.org/abs/2407.01449) | ViDoRe(v2) | 4 | DocumentUnderstanding: 4 | [Academic] | deu,eng,fra,spa | | [VisualDocumentRetrieval](https://arxiv.org/abs/2407.01449) | Visual Document Retrieval | 14 | DocumentUnderstanding: 14 | [Academic] | deu,eng,fra,spa | diff --git a/docs/tasks.md b/docs/tasks.md index bcce70245f..2828f07694 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -20,16 +20,19 @@ The following tables give you an overview of the tasks in MTEB. | [ATEC](https://aclanthology.org/2021.emnlp-main.357) (Raghu et al., 2021) | ['cmn'] | STS | s2s | | None | None | | [AfriSentiClassification](https://arxiv.org/abs/2302.08956) (Shamsuddeen Hassan Muhammad, 2023) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [AfriSentiLangClassification](https://huggingface.co/datasets/HausaNLP/afrisenti-lid-data/) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | -| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) (Rybak et al., 2020) | ['pol'] | Classification | s2s | [Reviews] | None | None | +| [AllegroReviews.v2](https://aclanthology.org/2020.acl-main.111.pdf) (Rybak et al., 2020) | ['pol'] | Classification | s2s | [Reviews] | None | None | | [AlloProfClusteringP2P.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [AlloProfClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | s2s | [Encyclopaedic, Written] | None | None | | [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Academic, Web, Written] | None | None | | [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Bhagavatula et al., 2019) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) (O{', 2021) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | None | None | -| [AmazonPolarityClassification](https://huggingface.co/datasets/amazon_polarity) (Julian McAuley, 2013) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | +| [AmazonCounterfactualVNClassification](https://arxiv.org/abs/2104.06893) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | +| [AmazonPolarityClassification.v2](https://huggingface.co/datasets/amazon_polarity) (Julian McAuley, 2013) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | +| [AmazonPolarityVNClassification](https://huggingface.co/datasets/amazon_polarity) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | | [AmazonReviewsClassification](https://arxiv.org/abs/2010.02573) (Phillip Keung, 2020) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'spa'] | Classification | s2s | [Reviews, Written] | None | None | -| [AngryTweetsClassification](https://aclanthology.org/2021.nodalida-main.53/) (Pauli et al., 2021) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [AmazonReviewsVNClassification](https://arxiv.org/abs/2010.02573) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | +| [AngryTweetsClassification.v2](https://aclanthology.org/2021.nodalida-main.53/) (Pauli et al., 2021) | ['dan'] | Classification | s2s | [Social, Written] | None | None | | [AppsRetrieval](https://arxiv.org/abs/2105.09938) (Dan Hendrycks, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 12530} | {'test': {'number_of_characters': 11335620, 'num_samples': 12530, 'num_queries': 3765, 'num_documents': 8765, 'min_document_length': 152, 'average_document_length': 717.27, 'max_document_length': 5742, 'unique_documents': 8765, 'min_query_length': 6, 'average_query_length': 1340.96, 'max_query_length': 289049, 'unique_queries': 3765, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3765}} | | [ArEntail](https://link.springer.com/article/10.1007/s10579-024-09731-1) (Obeidat et al., 2024) | ['ara'] | PairClassification | s2s | [News, Written] | None | None | | [ArXivHierarchicalClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 2065284, 'min_text_length': 103, 'average_text_length': 1008.44, 'max_text_length': 2103, 'min_labels_per_text': 1, 'average_labels_per_text': 1.46, 'max_labels_per_text': 381, 'unique_labels': 129, 'labels': {'cs': {'count': 356}, 'math': {'count': 381}, 'OC': {'count': 11}, 'hep-lat': {'count': 13}, 'hep': {'count': 98}, 'astro-ph': {'count': 213}, 'eess': {'count': 76}, 'quant-ph': {'count': 135}, 'DC': {'count': 5}, 'cond-mat': {'count': 274}, 'hep-th': {'count': 66}, 'SP': {'count': 33}, 'hep-ph': {'count': 69}, 'FA': {'count': 6}, 'nucl-th': {'count': 17}, 'q-bio': {'count': 80}, 'HE': {'count': 22}, 'HC': {'count': 2}, 'stat': {'count': 60}, 'ML': {'count': 16}, 'IV': {'count': 13}, 'stat-mech': {'count': 47}, 'DS': {'count': 14}, 'ME': {'count': 12}, 'CC': {'count': 2}, 'mtrl-sci': {'count': 22}, 'PE': {'count': 16}, 'NT': {'count': 11}, 'SC': {'count': 6}, 'AG': {'count': 13}, 'physics': {'count': 81}, 'ins-det': {'count': 9}, 'GA': {'count': 18}, 'BM': {'count': 6}, 'GN': {'count': 17}, 'NA': {'count': 15}, 'app-ph': {'count': 7}, 'RT': {'count': 6}, 'other': {'count': 37}, 'soft': {'count': 15}, 'CO': {'count': 33}, 'supr-con': {'count': 21}, 'chem-ph': {'count': 3}, 'DM': {'count': 2}, 'MN': {'count': 12}, 'q-fin': {'count': 27}, 'PM': {'count': 2}, 'AP': {'count': 27}, 'gr-qc': {'count': 15}, 'quant-gas': {'count': 8}, 'mes-hall': {'count': 33}, 'IT': {'count': 19}, 'SI': {'count': 6}, 'SG': {'count': 3}, 'bio-ph': {'count': 2}, 'SR': {'count': 16}, 'soc-ph': {'count': 5}, 'hep-ex': {'count': 15}, 'DG': {'count': 11}, 'NE': {'count': 5}, 'CR': {'count': 6}, 'CL': {'count': 12}, 'RM': {'count': 3}, 'econ': {'count': 17}, 'nlin': {'count': 5}, 'PS': {'count': 1}, 'LG': {'count': 26}, 'QA': {'count': 9}, 'str-el': {'count': 26}, 'CV': {'count': 34}, 'MF': {'count': 6}, 'IM': {'count': 7}, 'EM': {'count': 6}, 'TH': {'count': 5}, 'PR': {'count': 20}, 'AT': {'count': 4}, 'OA': {'count': 4}, 'CP': {'count': 6}, 'LO': {'count': 14}, 'flu-dyn': {'count': 6}, 'atom-ph': {'count': 8}, 'class-ph': {'count': 1}, 'SY': {'count': 20}, 'IR': {'count': 1}, 'plasm-ph': {'count': 8}, 'CE': {'count': 2}, 'AO': {'count': 1}, 'comp-ph': {'count': 3}, 'optics': {'count': 12}, 'MG': {'count': 4}, 'ST': {'count': 6}, 'nucl-ex': {'count': 6}, 'CY': {'count': 9}, 'ao-ph': {'count': 2}, 'DB': {'count': 1}, 'math-ph': {'count': 10}, 'NC': {'count': 13}, 'GT': {'count': 11}, 'TO': {'count': 2}, 'AI': {'count': 9}, 'NI': {'count': 2}, 'gen-ph': {'count': 4}, 'OT': {'count': 4}, 'SD': {'count': 2}, 'dis-nn': {'count': 4}, 'RO': {'count': 7}, 'CA': {'count': 6}, 'FL': {'count': 1}, 'SE': {'count': 5}, 'EP': {'count': 9}, 'hist-ph': {'count': 1}, 'QM': {'count': 9}, 'ed-ph': {'count': 2}, 'GR': {'count': 4}, 'MS': {'count': 1}, 'CD': {'count': 1}, 'ET': {'count': 1}, 'acc-ph': {'count': 5}, 'AC': {'count': 2}, 'OH': {'count': 1}, 'EC': {'count': 2}, 'DL': {'count': 1}, 'AS': {'count': 3}, 'geo-ph': {'count': 2}, 'CG': {'count': 3}, 'CB': {'count': 1}, 'AR': {'count': 1}, 'TR': {'count': 1}, 'atm-clus': {'count': 1}}}} | @@ -38,21 +41,26 @@ The following tables give you an overview of the tasks in MTEB. | [ArguAna-Fa](https://huggingface.co/datasets/MCINext/arguana-fa) | ['fas'] | Retrieval | s2p | [Blog] | None | None | | [ArguAna-NL](https://huggingface.co/datasets/clips/beir-nl-arguana) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Medical, Written] | None | None | +| [ArguAna-VN](http://argumentation.bplaced.net/arguana/data) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Medical, Written] | None | None | | [ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | None | None | -| [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | +| [ArxivClassification.v2](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | | [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) (Wang et al., 2021) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | +| [AskUbuntuDupQuestions-VN](https://github.com/taolei87/askubuntu) (Loc Pham, 2025) | ['vie'] | Reranking | s2s | [Programming, Web] | None | None | | [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | None | None | | [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | | [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | | [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None | +| [BIOSSES-VN](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Loc Pham, 2025) | ['vie'] | STS | s2s | [Medical] | None | None | | [BLINKIT2IMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentricQA | it2i | [Encyclopaedic] | {'test': 1206} | {'test': {'number_of_characters': 21204, 'num_samples': 1206, 'num_queries': 402, 'num_documents': 804, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 83, 'average_document_image_width': 788.44, 'max_document_image_width': 5087, 'min_document_image_height': 127, 'average_document_image_height': 813.95, 'max_document_image_height': 3230, 'num_document_images': 804, 'min_query_length': 51, 'average_query_length': 52.75, 'max_query_length': 57, 'unique_queries': 3, 'num_query_images': 402, 'min_query_image_width': 166, 'average_query_image_width': 815.13, 'max_query_image_width': 2733, 'min_query_image_height': 254, 'average_query_image_height': 875.38, 'max_query_image_height': 5687, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 804}} | | [BLINKIT2IRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | {'test': 1206} | {'test': {'number_of_characters': 21204, 'num_samples': 1206, 'num_queries': 402, 'num_documents': 804, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 804, 'min_query_length': 51, 'average_query_length': 52.75, 'max_query_length': 57, 'unique_queries': 3, 'num_query_images': 402, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 402}} | | [BLINKIT2TMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | VisionCentricQA | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'min_document_image_width': 0, 'average_document_image_width': 0, 'max_document_image_width': 0, 'min_document_image_height': 0, 'average_document_image_height': 0, 'max_document_image_height': 0, 'num_document_images': 0, 'min_query_length': 22, 'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_query_image_width': 63, 'average_query_image_width': 515.08, 'max_query_image_width': 4096, 'min_query_image_height': 232, 'average_query_image_height': 722.64, 'max_query_image_height': 3226, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | | [BLINKIT2TRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'num_document_images': 0, 'min_query_length': 22, 'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | +| [BSARDRetrieval.v2](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | | [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) (Zweigenbaum et al., 2017) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | -| [Banking77Classification](https://arxiv.org/abs/2003.04807) (Casanueva et al., 2020) | ['eng'] | Classification | s2s | [Written] | None | None | +| [Banking77Classification.v2](https://arxiv.org/abs/2003.04807) (Casanueva et al., 2020) | ['eng'] | Classification | s2s | [Written] | None | None | +| [Banking77VNClassification](https://arxiv.org/abs/2003.04807) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Written] | None | None | +| [BarExamQA](https://huggingface.co/datasets/reglab/barexam_qa) (Zheng et al., 2025) | ['eng'] | Retrieval | t2t | [Academic, Legal] | None | None | | [BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [News, Web, Written] | {'test': 521866} | {'test': {'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}}}} | | [BengaliDocumentClassification.v2](https://aclanthology.org/2023.eacl-main.4) (Akash et al., 2023) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliHateSpeechClassification.v2](https://huggingface.co/datasets/bn_hate_speech) (Karim et al., 2020) | ['ben'] | Classification | s2s | [News, Written] | None | None | @@ -60,6 +68,8 @@ The following tables give you an overview of the tasks in MTEB. | [BeytooteClustering](https://mcinext.com/) | ['fas'] | Clustering | p2p | [News] | None | None | | [BibleNLPBitextMining](https://arxiv.org/abs/2304.09919) (Akerman et al., 2023) | ['aai', 'aak', 'aau', 'aaz', 'abt', 'abx', 'aby', 'acf', 'acr', 'acu', 'adz', 'aer', 'aey', 'agd', 'agg', 'agm', 'agn', 'agr', 'agt', 'agu', 'aia', 'aii', 'aka', 'ake', 'alp', 'alq', 'als', 'aly', 'ame', 'amf', 'amk', 'amm', 'amn', 'amo', 'amp', 'amr', 'amu', 'amx', 'anh', 'anv', 'aoi', 'aoj', 'aom', 'aon', 'apb', 'ape', 'apn', 'apr', 'apu', 'apw', 'apz', 'arb', 'are', 'arl', 'arn', 'arp', 'asm', 'aso', 'ata', 'atb', 'atd', 'atg', 'att', 'auc', 'aui', 'auy', 'avt', 'awb', 'awk', 'awx', 'azb', 'azg', 'azz', 'bao', 'bba', 'bbb', 'bbr', 'bch', 'bco', 'bdd', 'bea', 'bef', 'bel', 'ben', 'beo', 'beu', 'bgs', 'bgt', 'bhg', 'bhl', 'big', 'bjk', 'bjp', 'bjr', 'bjv', 'bjz', 'bkd', 'bki', 'bkq', 'bkx', 'blw', 'blz', 'bmh', 'bmk', 'bmr', 'bmu', 'bnp', 'boa', 'boj', 'bon', 'box', 'bpr', 'bps', 'bqc', 'bqp', 'bre', 'bsj', 'bsn', 'bsp', 'bss', 'buk', 'bus', 'bvd', 'bvr', 'bxh', 'byr', 'byx', 'bzd', 'bzh', 'bzj', 'caa', 'cab', 'cac', 'caf', 'cak', 'cao', 'cap', 'car', 'cav', 'cax', 'cbc', 'cbi', 'cbk', 'cbr', 'cbs', 'cbt', 'cbu', 'cbv', 'cco', 'ceb', 'cek', 'ces', 'cgc', 'cha', 'chd', 'chf', 'chk', 'chq', 'chz', 'cjo', 'cjv', 'ckb', 'cle', 'clu', 'cme', 'cmn', 'cni', 'cnl', 'cnt', 'cof', 'con', 'cop', 'cot', 'cpa', 'cpb', 'cpc', 'cpu', 'cpy', 'crn', 'crx', 'cso', 'csy', 'cta', 'cth', 'ctp', 'ctu', 'cub', 'cuc', 'cui', 'cuk', 'cut', 'cux', 'cwe', 'cya', 'daa', 'dad', 'dah', 'dan', 'ded', 'deu', 'dgc', 'dgr', 'dgz', 'dhg', 'dif', 'dik', 'dji', 'djk', 'djr', 'dob', 'dop', 'dov', 'dwr', 'dww', 'dwy', 'ebk', 'eko', 'emi', 'emp', 'eng', 'enq', 'epo', 'eri', 'ese', 'esk', 'etr', 'ewe', 'faa', 'fai', 'far', 'ffm', 'for', 'fra', 'fue', 'fuf', 'fuh', 'gah', 'gai', 'gam', 'gaw', 'gdn', 'gdr', 'geb', 'gfk', 'ghs', 'glk', 'gmv', 'gng', 'gnn', 'gnw', 'gof', 'grc', 'gub', 'guh', 'gui', 'guj', 'gul', 'gum', 'gun', 'guo', 'gup', 'gux', 'gvc', 'gvf', 'gvn', 'gvs', 'gwi', 'gym', 'gyr', 'hat', 'hau', 'haw', 'hbo', 'hch', 'heb', 'heg', 'hin', 'hix', 'hla', 'hlt', 'hmo', 'hns', 'hop', 'hot', 'hrv', 'hto', 'hub', 'hui', 'hun', 'hus', 'huu', 'huv', 'hvn', 'ian', 'ign', 'ikk', 'ikw', 'ilo', 'imo', 'inb', 'ind', 'ino', 'iou', 'ipi', 'isn', 'ita', 'iws', 'ixl', 'jac', 'jae', 'jao', 'jic', 'jid', 'jiv', 'jni', 'jpn', 'jvn', 'kan', 'kaq', 'kbc', 'kbh', 'kbm', 'kbq', 'kdc', 'kde', 'kdl', 'kek', 'ken', 'kew', 'kgf', 'kgk', 'kgp', 'khs', 'khz', 'kik', 'kiw', 'kiz', 'kje', 'kjs', 'kkc', 'kkl', 'klt', 'klv', 'kmg', 'kmh', 'kmk', 'kmo', 'kms', 'kmu', 'kne', 'knf', 'knj', 'knv', 'kos', 'kpf', 'kpg', 'kpj', 'kpr', 'kpw', 'kpx', 'kqa', 'kqc', 'kqf', 'kql', 'kqw', 'ksd', 'ksj', 'ksr', 'ktm', 'kto', 'kud', 'kue', 'kup', 'kvg', 'kvn', 'kwd', 'kwf', 'kwi', 'kwj', 'kyc', 'kyf', 'kyg', 'kyq', 'kyz', 'kze', 'lac', 'lat', 'lbb', 'lbk', 'lcm', 'leu', 'lex', 'lgl', 'lid', 'lif', 'lin', 'lit', 'llg', 'lug', 'luo', 'lww', 'maa', 'maj', 'mal', 'mam', 'maq', 'mar', 'mau', 'mav', 'maz', 'mbb', 'mbc', 'mbh', 'mbj', 'mbl', 'mbs', 'mbt', 'mca', 'mcb', 'mcd', 'mcf', 'mco', 'mcp', 'mcq', 'mcr', 'mdy', 'med', 'mee', 'mek', 'meq', 'met', 'meu', 'mgc', 'mgh', 'mgw', 'mhl', 'mib', 'mic', 'mie', 'mig', 'mih', 'mil', 'mio', 'mir', 'mit', 'miz', 'mjc', 'mkj', 'mkl', 'mkn', 'mks', 'mle', 'mlh', 'mlp', 'mmo', 'mmx', 'mna', 'mop', 'mox', 'mph', 'mpj', 'mpm', 'mpp', 'mps', 'mpt', 'mpx', 'mqb', 'mqj', 'msb', 'msc', 'msk', 'msm', 'msy', 'mti', 'mto', 'mux', 'muy', 'mva', 'mvn', 'mwc', 'mwe', 'mwf', 'mwp', 'mxb', 'mxp', 'mxq', 'mxt', 'mya', 'myk', 'myu', 'myw', 'myy', 'mzz', 'nab', 'naf', 'nak', 'nas', 'nbq', 'nca', 'nch', 'ncj', 'ncl', 'ncu', 'ndg', 'ndj', 'nfa', 'ngp', 'ngu', 'nhe', 'nhg', 'nhi', 'nho', 'nhr', 'nhu', 'nhw', 'nhy', 'nif', 'nii', 'nin', 'nko', 'nld', 'nlg', 'nna', 'nnq', 'noa', 'nop', 'not', 'nou', 'npi', 'npl', 'nsn', 'nss', 'ntj', 'ntp', 'ntu', 'nuy', 'nvm', 'nwi', 'nya', 'nys', 'nyu', 'obo', 'okv', 'omw', 'ong', 'ons', 'ood', 'opm', 'ory', 'ote', 'otm', 'otn', 'otq', 'ots', 'pab', 'pad', 'pah', 'pan', 'pao', 'pes', 'pib', 'pio', 'pir', 'piu', 'pjt', 'pls', 'plu', 'pma', 'poe', 'poh', 'poi', 'pol', 'pon', 'por', 'poy', 'ppo', 'prf', 'pri', 'ptp', 'ptu', 'pwg', 'qub', 'quc', 'quf', 'quh', 'qul', 'qup', 'qvc', 'qve', 'qvh', 'qvm', 'qvn', 'qvs', 'qvw', 'qvz', 'qwh', 'qxh', 'qxn', 'qxo', 'rai', 'reg', 'rgu', 'rkb', 'rmc', 'rmy', 'ron', 'roo', 'rop', 'row', 'rro', 'ruf', 'rug', 'rus', 'rwo', 'sab', 'san', 'sbe', 'sbk', 'sbs', 'seh', 'sey', 'sgb', 'sgz', 'shj', 'shp', 'sim', 'sja', 'sll', 'smk', 'snc', 'snn', 'snp', 'snx', 'sny', 'som', 'soq', 'soy', 'spa', 'spl', 'spm', 'spp', 'sps', 'spy', 'sri', 'srm', 'srn', 'srp', 'srq', 'ssd', 'ssg', 'ssx', 'stp', 'sua', 'sue', 'sus', 'suz', 'swe', 'swh', 'swp', 'sxb', 'tac', 'taj', 'tam', 'tav', 'taw', 'tbc', 'tbf', 'tbg', 'tbo', 'tbz', 'tca', 'tcs', 'tcz', 'tdt', 'tee', 'tel', 'ter', 'tet', 'tew', 'tfr', 'tgk', 'tgl', 'tgo', 'tgp', 'tha', 'tif', 'tim', 'tiw', 'tiy', 'tke', 'tku', 'tlf', 'tmd', 'tna', 'tnc', 'tnk', 'tnn', 'tnp', 'toc', 'tod', 'tof', 'toj', 'ton', 'too', 'top', 'tos', 'tpa', 'tpi', 'tpt', 'tpz', 'trc', 'tsw', 'ttc', 'tte', 'tuc', 'tue', 'tuf', 'tuo', 'tur', 'tvk', 'twi', 'txq', 'txu', 'tzj', 'tzo', 'ubr', 'ubu', 'udu', 'uig', 'ukr', 'uli', 'ulk', 'upv', 'ura', 'urb', 'urd', 'uri', 'urt', 'urw', 'usa', 'usp', 'uvh', 'uvl', 'vid', 'vie', 'viv', 'vmy', 'waj', 'wal', 'wap', 'wat', 'wbi', 'wbp', 'wed', 'wer', 'wim', 'wiu', 'wiv', 'wmt', 'wmw', 'wnc', 'wnu', 'wol', 'wos', 'wrk', 'wro', 'wrs', 'wsk', 'wuv', 'xav', 'xbi', 'xed', 'xla', 'xnn', 'xon', 'xsi', 'xtd', 'xtm', 'yaa', 'yad', 'yal', 'yap', 'yaq', 'yby', 'ycn', 'yka', 'yle', 'yml', 'yon', 'yor', 'yrb', 'yre', 'yss', 'yuj', 'yut', 'yuw', 'yva', 'zaa', 'zab', 'zac', 'zad', 'zai', 'zaj', 'zam', 'zao', 'zap', 'zar', 'zas', 'zat', 'zav', 'zaw', 'zca', 'zga', 'zia', 'ziw', 'zlm', 'zos', 'zpc', 'zpl', 'zpm', 'zpo', 'zpq', 'zpu', 'zpv', 'zpz', 'zsr', 'ztq', 'zty', 'zyp'] | BitextMining | s2s | [Religious, Written] | None | None | | [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | +| [BillSumCA](https://huggingface.co/datasets/FiscalNote/billsum) (Eidelman et al., 2019) | ['eng'] | Retrieval | t2t | [Government, Legal] | None | None | +| [BillSumUS](https://huggingface.co/datasets/FiscalNote/billsum) (Eidelman et al., 2019) | ['eng'] | Retrieval | t2t | [Government, Legal] | None | None | | [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None | | [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1851} | {'test': {'num_samples': 1851, 'unique_num_labels': 490, 'min_image_width': 267, 'average_image_width': 2081.56, 'max_image_width': 6400, 'min_image_height': 200, 'average_image_height': 1609.19, 'max_image_height': 5400, 'labels': {'0': {'count': 4}, '1': {'count': 5}, '2': {'count': 4}, '3': {'count': 4}, '4': {'count': 4}, '5': {'count': 2}, '6': {'count': 3}, '7': {'count': 5}, '8': {'count': 4}, '9': {'count': 5}, '11': {'count': 3}, '12': {'count': 4}, '13': {'count': 5}, '14': {'count': 4}, '15': {'count': 5}, '16': {'count': 4}, '17': {'count': 3}, '18': {'count': 2}, '19': {'count': 5}, '20': {'count': 4}, '21': {'count': 4}, '22': {'count': 5}, '23': {'count': 2}, '24': {'count': 4}, '25': {'count': 3}, '26': {'count': 4}, '27': {'count': 4}, '28': {'count': 2}, '29': {'count': 5}, '30': {'count': 3}, '31': {'count': 3}, '32': {'count': 3}, '33': {'count': 4}, '34': {'count': 4}, '35': {'count': 4}, '36': {'count': 3}, '37': {'count': 3}, '38': {'count': 4}, '39': {'count': 3}, '40': {'count': 4}, '41': {'count': 3}, '42': {'count': 3}, '43': {'count': 4}, '44': {'count': 2}, '45': {'count': 3}, '47': {'count': 5}, '48': {'count': 2}, '49': {'count': 5}, '50': {'count': 4}, '51': {'count': 5}, '52': {'count': 3}, '53': {'count': 3}, '54': {'count': 4}, '55': {'count': 2}, '56': {'count': 2}, '57': {'count': 5}, '58': {'count': 2}, '59': {'count': 1}, '60': {'count': 1}, '61': {'count': 3}, '62': {'count': 3}, '63': {'count': 5}, '64': {'count': 5}, '65': {'count': 4}, '67': {'count': 2}, '68': {'count': 3}, '69': {'count': 4}, '70': {'count': 5}, '71': {'count': 5}, '72': {'count': 5}, '73': {'count': 4}, '74': {'count': 5}, '75': {'count': 4}, '76': {'count': 4}, '80': {'count': 3}, '81': {'count': 5}, '82': {'count': 3}, '83': {'count': 5}, '84': {'count': 3}, '85': {'count': 4}, '86': {'count': 4}, '87': {'count': 5}, '88': {'count': 4}, '89': {'count': 5}, '90': {'count': 4}, '91': {'count': 4}, '92': {'count': 5}, '93': {'count': 4}, '94': {'count': 4}, '95': {'count': 5}, '96': {'count': 5}, '97': {'count': 5}, '98': {'count': 3}, '99': {'count': 5}, '100': {'count': 4}, '101': {'count': 5}, '102': {'count': 4}, '103': {'count': 3}, '105': {'count': 4}, '108': {'count': 4}, '109': {'count': 5}, '110': {'count': 3}, '111': {'count': 3}, '112': {'count': 4}, '113': {'count': 4}, '114': {'count': 5}, '115': {'count': 4}, '116': {'count': 5}, '117': {'count': 4}, '118': {'count': 4}, '119': {'count': 5}, '120': {'count': 5}, '121': {'count': 4}, '122': {'count': 3}, '124': {'count': 3}, '125': {'count': 4}, '126': {'count': 2}, '127': {'count': 3}, '128': {'count': 5}, '129': {'count': 5}, '130': {'count': 5}, '131': {'count': 3}, '132': {'count': 4}, '133': {'count': 4}, '134': {'count': 2}, '135': {'count': 5}, '136': {'count': 5}, '137': {'count': 3}, '138': {'count': 4}, '139': {'count': 3}, '140': {'count': 3}, '141': {'count': 2}, '142': {'count': 3}, '143': {'count': 5}, '144': {'count': 4}, '145': {'count': 5}, '146': {'count': 5}, '147': {'count': 5}, '148': {'count': 4}, '149': {'count': 4}, '150': {'count': 5}, '151': {'count': 5}, '152': {'count': 5}, '153': {'count': 3}, '154': {'count': 4}, '155': {'count': 3}, '156': {'count': 3}, '157': {'count': 3}, '159': {'count': 3}, '160': {'count': 4}, '161': {'count': 4}, '162': {'count': 4}, '163': {'count': 4}, '164': {'count': 3}, '165': {'count': 3}, '166': {'count': 3}, '167': {'count': 4}, '168': {'count': 4}, '169': {'count': 4}, '170': {'count': 4}, '171': {'count': 5}, '172': {'count': 4}, '173': {'count': 4}, '174': {'count': 5}, '175': {'count': 4}, '176': {'count': 2}, '177': {'count': 5}, '178': {'count': 5}, '179': {'count': 5}, '180': {'count': 5}, '181': {'count': 4}, '183': {'count': 2}, '184': {'count': 3}, '185': {'count': 2}, '186': {'count': 5}, '187': {'count': 2}, '188': {'count': 3}, '189': {'count': 2}, '190': {'count': 5}, '191': {'count': 4}, '192': {'count': 3}, '193': {'count': 3}, '194': {'count': 4}, '195': {'count': 3}, '196': {'count': 4}, '197': {'count': 3}, '198': {'count': 4}, '199': {'count': 5}, '200': {'count': 5}, '201': {'count': 1}, '204': {'count': 4}, '205': {'count': 5}, '206': {'count': 4}, '207': {'count': 3}, '208': {'count': 4}, '209': {'count': 4}, '210': {'count': 4}, '211': {'count': 4}, '212': {'count': 5}, '213': {'count': 4}, '214': {'count': 5}, '215': {'count': 3}, '216': {'count': 1}, '217': {'count': 5}, '218': {'count': 2}, '219': {'count': 5}, '220': {'count': 4}, '221': {'count': 5}, '222': {'count': 5}, '223': {'count': 3}, '224': {'count': 4}, '225': {'count': 5}, '226': {'count': 3}, '227': {'count': 4}, '228': {'count': 3}, '229': {'count': 4}, '230': {'count': 4}, '231': {'count': 5}, '232': {'count': 5}, '233': {'count': 5}, '234': {'count': 4}, '235': {'count': 4}, '236': {'count': 5}, '237': {'count': 5}, '238': {'count': 5}, '239': {'count': 4}, '240': {'count': 3}, '241': {'count': 3}, '242': {'count': 4}, '243': {'count': 5}, '244': {'count': 2}, '245': {'count': 4}, '246': {'count': 5}, '247': {'count': 3}, '248': {'count': 3}, '249': {'count': 5}, '250': {'count': 5}, '251': {'count': 4}, '252': {'count': 2}, '253': {'count': 5}, '254': {'count': 5}, '255': {'count': 5}, '256': {'count': 4}, '257': {'count': 4}, '258': {'count': 4}, '259': {'count': 3}, '260': {'count': 5}, '261': {'count': 4}, '262': {'count': 4}, '264': {'count': 4}, '265': {'count': 3}, '266': {'count': 5}, '267': {'count': 5}, '268': {'count': 3}, '269': {'count': 2}, '270': {'count': 3}, '271': {'count': 4}, '272': {'count': 4}, '273': {'count': 5}, '274': {'count': 5}, '275': {'count': 5}, '276': {'count': 2}, '277': {'count': 3}, '278': {'count': 5}, '279': {'count': 5}, '280': {'count': 4}, '281': {'count': 5}, '282': {'count': 5}, '283': {'count': 3}, '284': {'count': 5}, '285': {'count': 3}, '286': {'count': 5}, '287': {'count': 5}, '288': {'count': 4}, '289': {'count': 4}, '290': {'count': 5}, '291': {'count': 3}, '292': {'count': 2}, '293': {'count': 1}, '294': {'count': 1}, '295': {'count': 2}, '296': {'count': 4}, '297': {'count': 5}, '298': {'count': 4}, '300': {'count': 3}, '301': {'count': 3}, '303': {'count': 4}, '304': {'count': 4}, '305': {'count': 4}, '306': {'count': 2}, '307': {'count': 5}, '308': {'count': 4}, '309': {'count': 2}, '310': {'count': 3}, '311': {'count': 3}, '312': {'count': 4}, '313': {'count': 3}, '314': {'count': 3}, '315': {'count': 3}, '316': {'count': 5}, '317': {'count': 4}, '318': {'count': 5}, '319': {'count': 4}, '320': {'count': 4}, '321': {'count': 3}, '322': {'count': 5}, '323': {'count': 4}, '324': {'count': 2}, '325': {'count': 1}, '326': {'count': 3}, '327': {'count': 4}, '328': {'count': 3}, '330': {'count': 4}, '331': {'count': 4}, '332': {'count': 2}, '333': {'count': 5}, '334': {'count': 5}, '335': {'count': 5}, '336': {'count': 4}, '337': {'count': 4}, '338': {'count': 5}, '339': {'count': 3}, '340': {'count': 5}, '341': {'count': 5}, '342': {'count': 5}, '343': {'count': 2}, '344': {'count': 2}, '345': {'count': 3}, '346': {'count': 3}, '347': {'count': 5}, '348': {'count': 3}, '349': {'count': 2}, '350': {'count': 4}, '352': {'count': 5}, '353': {'count': 3}, '354': {'count': 5}, '355': {'count': 5}, '356': {'count': 4}, '357': {'count': 3}, '358': {'count': 3}, '359': {'count': 4}, '360': {'count': 5}, '361': {'count': 5}, '362': {'count': 4}, '363': {'count': 3}, '364': {'count': 4}, '365': {'count': 1}, '366': {'count': 4}, '367': {'count': 3}, '368': {'count': 4}, '369': {'count': 3}, '370': {'count': 5}, '371': {'count': 3}, '372': {'count': 5}, '373': {'count': 4}, '374': {'count': 4}, '375': {'count': 3}, '376': {'count': 4}, '377': {'count': 4}, '378': {'count': 4}, '379': {'count': 4}, '380': {'count': 4}, '381': {'count': 4}, '382': {'count': 1}, '383': {'count': 4}, '384': {'count': 4}, '385': {'count': 4}, '386': {'count': 2}, '387': {'count': 4}, '388': {'count': 2}, '389': {'count': 5}, '390': {'count': 4}, '391': {'count': 5}, '392': {'count': 4}, '394': {'count': 4}, '395': {'count': 4}, '396': {'count': 4}, '397': {'count': 4}, '398': {'count': 5}, '399': {'count': 4}, '400': {'count': 5}, '401': {'count': 4}, '402': {'count': 4}, '404': {'count': 5}, '405': {'count': 5}, '406': {'count': 5}, '407': {'count': 4}, '408': {'count': 2}, '409': {'count': 4}, '410': {'count': 3}, '411': {'count': 5}, '412': {'count': 4}, '413': {'count': 3}, '414': {'count': 4}, '415': {'count': 4}, '416': {'count': 4}, '417': {'count': 5}, '418': {'count': 3}, '419': {'count': 5}, '421': {'count': 4}, '422': {'count': 3}, '423': {'count': 5}, '424': {'count': 5}, '425': {'count': 2}, '426': {'count': 5}, '427': {'count': 4}, '428': {'count': 5}, '429': {'count': 3}, '430': {'count': 2}, '431': {'count': 3}, '432': {'count': 5}, '433': {'count': 4}, '434': {'count': 3}, '435': {'count': 3}, '437': {'count': 3}, '438': {'count': 5}, '439': {'count': 2}, '440': {'count': 4}, '441': {'count': 4}, '442': {'count': 5}, '443': {'count': 2}, '444': {'count': 3}, '445': {'count': 3}, '446': {'count': 5}, '447': {'count': 3}, '448': {'count': 2}, '449': {'count': 1}, '450': {'count': 3}, '451': {'count': 3}, '452': {'count': 4}, '453': {'count': 2}, '454': {'count': 4}, '455': {'count': 4}, '456': {'count': 5}, '458': {'count': 4}, '459': {'count': 4}, '460': {'count': 5}, '461': {'count': 4}, '462': {'count': 4}, '463': {'count': 5}, '464': {'count': 5}, '466': {'count': 2}, '467': {'count': 4}, '468': {'count': 3}, '469': {'count': 5}, '470': {'count': 5}, '471': {'count': 2}, '472': {'count': 4}, '473': {'count': 3}, '474': {'count': 5}, '475': {'count': 5}, '476': {'count': 5}, '477': {'count': 4}, '478': {'count': 2}, '479': {'count': 4}, '480': {'count': 4}, '481': {'count': 5}, '482': {'count': 4}, '483': {'count': 3}, '484': {'count': 5}, '485': {'count': 5}, '486': {'count': 4}, '487': {'count': 3}, '488': {'count': 3}, '489': {'count': 1}, '490': {'count': 1}, '491': {'count': 2}, '492': {'count': 4}, '493': {'count': 4}, '494': {'count': 3}, '495': {'count': 4}, '496': {'count': 5}, '497': {'count': 5}, '498': {'count': 5}, '499': {'count': 4}, '79': {'count': 4}, '106': {'count': 4}, '107': {'count': 4}, '202': {'count': 1}, '203': {'count': 1}, '457': {'count': 3}, '77': {'count': 2}, '78': {'count': 4}, '182': {'count': 2}, '263': {'count': 4}, '104': {'count': 1}, '158': {'count': 5}, '329': {'count': 1}, '393': {'count': 2}, '420': {'count': 2}}}} | @@ -75,7 +85,7 @@ The following tables give you an overview of the tasks in MTEB. | [BuiltBenchReranking](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Reranking | p2p | [Engineering, Written] | {'test': 179} | {'test': {'num_samples': 179, 'number_of_characters': 1572224, 'num_positive': 1253, 'num_negative': 3759, 'min_query_length': 35, 'avg_query_length': 140.44, 'max_query_length': 307, 'unique_query': 82, 'min_positive_length': 190, 'avg_positive_length': 311.32, 'max_positive_length': 477, 'unique_positive': 1253, 'min_negative_length': 166, 'avg_negative_length': 307.79, 'max_negative_length': 508, 'unique_negative': 2241}} | | [BuiltBenchRetrieval](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Retrieval | p2p | [Engineering, Written] | None | None | | [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None | -| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [CBD.v2](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [CDSC-E](https://aclanthology.org/P17-1073.pdf) (Wr{\'o, 2017) | ['pol'] | PairClassification | s2s | [Written] | None | None | | [CDSC-R](https://aclanthology.org/P17-1073.pdf) (Wr{\'o, 2017) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | @@ -97,6 +107,7 @@ The following tables give you an overview of the tasks in MTEB. | [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | | [CQADupstack-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackAndroid-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackAndroid-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Non-fiction, Programming, Web, Written] | None | None | | [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Programming, Web, Written] | None | None | | [CQADupstackAndroidRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-android-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackEnglish-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | @@ -106,36 +117,45 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackGamingRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackGis-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackGis-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackGisRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackMathematica-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackMathematica-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackMathematicaRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackPhysics-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackPhysics-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammers-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackProgrammers-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Non-fiction, Programming, Written] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Programming, Written] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng', 'vie'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStats-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackStats-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackTex-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackTex-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackTexRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackUnix-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackUnix-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackUnixRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackWebmasters-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackWebmasters-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackWebmastersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackWordpress-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackWordpress-VN](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | -| [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | +| [CSFDCZMovieReviewSentimentClassification.v2](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CSFDSKMovieReviewSentimentClassification.v2](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | | [CTKFactsNLI](https://arxiv.org/abs/2201.11115) (Ullrich et al., 2023) | ['ces'] | PairClassification | s2s | [News, Written] | None | None | | [CUADAffiliateLicenseLicenseeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUADAffiliateLicenseLicensorLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -185,11 +205,13 @@ The following tables give you an overview of the tasks in MTEB. | [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1986} | {'test': {'num_samples': 1986, 'unique_num_labels': 63, 'min_image_width': 105, 'average_image_width': 277.19, 'max_image_width': 300, 'min_image_height': 114, 'average_image_height': 255.33, 'max_image_height': 300, 'min_label_text_length': 17, 'average_label_text_length': 21.88, 'max_label_text_length': 31, 'labels': {'36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) (Zotova et al., 2020) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | +| [ChatDoctorRetrieval](https://huggingface.co/datasets/embedding-benchmark/ChatDoctor_HealthCareMagic) | ['eng'] | Retrieval | s2p | [Medical] | {'test': 11136} | {'test': {'number_of_characters': 2389149, 'num_samples': 11136, 'num_queries': 5591, 'num_documents': 5545, 'min_document_length': 12, 'average_document_length': 428.87, 'max_document_length': 4846, 'unique_documents': 5545, 'min_query_length': 2, 'average_query_length': 1.98, 'max_query_length': 2, 'unique_queries': 5591, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5545}} | | [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [ClimateFEVER-NL](https://huggingface.co/datasets/clips/beir-nl-climate-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [ClimateFEVER-VN](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER.v2](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, Written] | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClusTREC-Covid](https://github.com/katzurik/Knowledge_Navigator/tree/main/Benchmarks/CLUSTREC%20COVID) (Katz et al., 2024) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 4568} | {'test': {'num_samples': 4568, 'number_of_characters': 2977845, 'min_text_length': 14, 'average_text_length': 651.89, 'max_text_length': 8364, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 100, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 100}, 'coronavirus response to weather changes': {'count': 100}, 'coronavirus immunity': {'count': 78}, 'how do people die from the coronavirus': {'count': 100}, 'animal models of COVID-19': {'count': 100}, 'coronavirus test rapid testing': {'count': 100}, 'serological tests for coronavirus': {'count': 100}, 'coronavirus under reporting': {'count': 100}, 'coronavirus in Canada': {'count': 92}, 'coronavirus social distancing impact': {'count': 100}, 'coronavirus hospital rationing': {'count': 100}, 'coronavirus quarantine': {'count': 100}, 'how does coronavirus spread': {'count': 100}, 'coronavirus super spreaders': {'count': 98}, 'coronavirus outside body': {'count': 34}, 'how long does coronavirus survive on surfaces': {'count': 74}, 'coronavirus clinical trials': {'count': 100}, 'masks prevent coronavirus': {'count': 100}, 'what alcohol sanitizer kills coronavirus': {'count': 64}, 'coronavirus and ACE inhibitors': {'count': 100}, 'coronavirus mortality': {'count': 100}, 'coronavirus heart impacts': {'count': 100}, 'coronavirus hypertension': {'count': 74}, 'coronavirus diabetes': {'count': 100}, 'coronavirus biomarkers': {'count': 100}, 'coronavirus early symptoms': {'count': 100}, 'coronavirus asymptomatic': {'count': 100}, 'coronavirus hydroxychloroquine': {'count': 100}, 'coronavirus drug repurposing': {'count': 100}, 'coronavirus remdesivir': {'count': 100}, 'difference between coronavirus and flu': {'count': 100}, 'coronavirus subtypes': {'count': 6}, 'coronavirus vaccine candidates': {'count': 36}, 'coronavirus recovery': {'count': 100}, 'coronavirus public datasets': {'count': 100}, 'SARS-CoV-2 spike structure': {'count': 100}, 'SARS-CoV-2 phylogenetic analysis': {'count': 100}, 'COVID inflammatory response': {'count': 100}, 'COVID-19 cytokine storm': {'count': 100}, 'coronavirus mutations': {'count': 100}, 'COVID-19 in African-Americans': {'count': 100}, 'Vitamin D and COVID-19': {'count': 100}, 'violence during pandemic': {'count': 100}, 'impact of masks on coronavirus transmission': {'count': 100}, 'coronavirus mental health impact': {'count': 100}, 'dexamethasone coronavirus': {'count': 92}, 'COVID-19 outcomes in children': {'count': 100}, 'school reopening coronavirus': {'count': 100}, 'post-infection COVID-19 immunity': {'count': 88}, 'mRNA vaccine coronavirus': {'count': 32}}, 'hf_subset_descriptive_stats': {'title and abstract': {'num_samples': 2284, 'number_of_characters': 2755462, 'min_text_length': 14, 'average_text_length': 1206.42, 'max_text_length': 8364, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}, 'title': {'num_samples': 2284, 'number_of_characters': 222383, 'min_text_length': 14, 'average_text_length': 97.37, 'max_text_length': 348, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}}}} | @@ -229,25 +251,28 @@ The following tables give you an overview of the tasks in MTEB. | [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | | [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | | [CyrillicTurkicLangClassification](https://huggingface.co/datasets/tatiana-merz/cyrillic_turkic_langs) (Goldhahn et al., 2012) | ['bak', 'chv', 'kaz', 'kir', 'krc', 'rus', 'sah', 'tat', 'tyv'] | Classification | s2s | [Web, Written] | None | None | -| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | -| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechProductReviewSentimentClassification.v2](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechSoMeSentimentClassification.v2](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) (P{\v{r, 2022) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPedia-Fa](https://huggingface.co/datasets/MCINext/dbpedia-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [DBPedia-NL](https://huggingface.co/datasets/clips/beir-nl-dbpedia-entity) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPedia-VN](https://github.com/iai-group/DBpedia-Entity/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | -| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) (Sigurbergsson et al., 2020) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DBpediaClassification.v2](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | +| [DKHateClassification.v2](https://aclanthology.org/2020.lrec-1.430/) (Sigurbergsson et al., 2020) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DS1000Retrieval](https://huggingface.co/datasets/embedding-benchmark/DS1000) (Lai et al., 2022) | ['eng', 'python'] | Retrieval | s2s | [Programming] | {'test': 2000} | {'test': {'number_of_characters': 88032, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 47, 'average_document_length': 86.03, 'max_document_length': 235, 'unique_documents': 1000, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | | [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. Cimpoi, 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | | [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. Cimpoi, 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'min_label_text_length': 24, 'average_label_text_length': 27.38, 'max_label_text_length': 32, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | | [DadoEvalCoarseClassification](https://github.com/dhfbk/DaDoEval) (Menini et al., 2020) | ['ita'] | Classification | s2s | [Written] | None | None | -| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) (Elena Volodina, 2021) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | +| [DalajClassification.v2](https://spraakbanken.gu.se/en/resources/superlim) (Elena Volodina, 2021) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | | [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) (N{\o, 2021) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | | [DanishMedicinesAgencyBitextMining](https://sprogteknologi.dk/dataset/bilingual-english-danish-parallel-corpus-from-the-danish-medicines-agency) (Rozis et al., 2019) | ['dan', 'eng'] | BitextMining | s2s | [Medical, Written] | None | None | -| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | -| [DeepSentiPers](https://github.com/JoyeBright/DeepSentiPers) | ['fas'] | Classification | s2s | [Reviews] | None | None | +| [DanishPoliticalCommentsClassification.v2](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [Ddisco.v2](https://aclanthology.org/2022.lrec-1.260/) (Flansmose Mikkelsen et al., 2022) | ['dan'] | Classification | s2s | [Non-fiction, Social, Written] | None | None | +| [DeepSentiPers.v2](https://github.com/JoyeBright/DeepSentiPers) | ['fas'] | Classification | s2s | [Reviews] | None | None | | [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | None | None | | [DigikalamagClassification](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Classification | p2p | [Web] | None | None | @@ -260,22 +285,24 @@ The following tables give you an overview of the tasks in MTEB. | [Diversity5LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | None | -| [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None | +| [DutchBookReviewSentimentClassification.v2](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None | | [EDIST2ITRetrieval](https://aclanthology.org/2023.emnlp-main.297/) (Liu et al., 2023) | ['eng'] | Any2AnyRetrieval | t2it | [News] | {'test': 1050308} | {'test': {'number_of_characters': 97112347, 'num_samples': 1050308, 'num_queries': 3241, 'num_documents': 1047067, 'min_document_length': 3, 'average_document_length': 92.37, 'max_document_length': 264, 'unique_documents': 553171, 'num_document_images': 1047067, 'min_query_length': 30, 'average_query_length': 120.33, 'max_query_length': 340, 'unique_queries': 3241, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 2.57, 'max_relevant_docs_per_query': 8, 'unique_relevant_docs': 8324}} | | [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'min_query_length': 1, 'avg_query_length': 19.69, 'max_query_length': 151, 'unique_query': 29269, 'min_positive_length': 1, 'avg_positive_length': 803.92, 'max_positive_length': 8640, 'unique_positive': 217712, 'min_negative_length': 1, 'avg_negative_length': 808.5, 'max_negative_length': 4441, 'unique_negative': 39551, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'min_query_length': 1, 'avg_query_length': 21.44, 'max_query_length': 151, 'unique_query': 21296, 'min_positive_length': 1, 'avg_positive_length': 868.37, 'max_positive_length': 5545, 'unique_positive': 150734, 'min_negative_length': 1, 'avg_negative_length': 864.45, 'max_negative_length': 3779, 'unique_negative': 23073}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'min_query_length': 3, 'avg_query_length': 20.68, 'max_query_length': 59, 'unique_query': 3703, 'min_positive_length': 1, 'avg_positive_length': 980.96, 'max_positive_length': 8640, 'unique_positive': 32921, 'min_negative_length': 1, 'avg_negative_length': 1023.22, 'max_negative_length': 4441, 'unique_negative': 9285}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'min_query_length': 1, 'avg_query_length': 10.15, 'max_query_length': 60, 'unique_query': 4286, 'min_positive_length': 1, 'avg_positive_length': 358.36, 'max_positive_length': 3488, 'unique_positive': 35165, 'min_negative_length': 1, 'avg_negative_length': 388.08, 'max_negative_length': 3940, 'unique_negative': 7289}}}} | | [EcomRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) (Dadas et al., 2020) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | | [EmitClassification](https://github.com/oaraque/emit) (Araque et al., 2023) | ['ita'] | MultilabelClassification | s2s | [Social, Written] | None | None | -| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) (Saravia et al., 2018) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [EmotionClassification.v2](https://www.aclweb.org/anthology/D18-1404) (Saravia et al., 2018) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [EmotionVNClassification](https://www.aclweb.org/anthology/D18-1404) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Social, Written] | None | None | | [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 72056} | {'test': {'number_of_characters': 88615743, 'num_samples': 72056, 'num_queries': 3743, 'num_documents': 68313, 'min_document_length': 24, 'average_document_length': 1294.37, 'max_document_length': 72928, 'unique_documents': 49186, 'num_document_images': 68313, 'min_query_length': 19, 'average_query_length': 51.7, 'max_query_length': 245, 'unique_queries': 2832, 'num_query_images': 3743, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.31, 'max_relevant_docs_per_query': 75, 'unique_relevant_docs': 2683}} | | [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) (Anu Käver, 2021) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) (Hille Pajupuu, 2023) | ['est'] | Classification | s2s | [News, Written] | None | None | +| [EstonianValenceClassification.v2](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) (Hille Pajupuu, 2023) | ['est'] | Classification | s2s | [News, Written] | None | None | | [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 5400} | {'test': {'num_samples': 5400, 'unique_num_labels': 10, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'labels': {'4': {'count': 501}, '3': {'count': 496}, '7': {'count': 554}, '2': {'count': 573}, '9': {'count': 609}, '0': {'count': 596}, '8': {'count': 529}, '1': {'count': 608}, '5': {'count': 396}, '6': {'count': 538}}}} | | [EuroSATZeroShot](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 5400} | {'test': {'num_samples': 5400, 'unique_num_labels': 10, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'min_label_text_length': 36, 'average_label_text_length': 45.2, 'max_label_text_length': 53, 'labels': {'4': {'count': 501}, '3': {'count': 496}, '7': {'count': 554}, '2': {'count': 573}, '9': {'count': 609}, '0': {'count': 596}, '8': {'count': 529}, '1': {'count': 608}, '5': {'count': 396}, '6': {'count': 538}}}} | | [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 7178} | {'test': {'num_samples': 7178, 'unique_num_labels': 7, 'min_image_width': 48, 'average_image_width': 48.0, 'max_image_width': 48, 'min_image_height': 48, 'average_image_height': 48.0, 'max_image_height': 48, 'labels': {'0': {'count': 958}, '1': {'count': 111}, '2': {'count': 1024}, '3': {'count': 1774}, '4': {'count': 1233}, '5': {'count': 1247}, '6': {'count': 831}}}} | | [FER2013ZeroShot](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 7178} | {'test': {'num_samples': 7178, 'unique_num_labels': 7, 'min_image_width': 48, 'average_image_width': 48.0, 'max_image_width': 48, 'min_image_height': 48, 'average_image_height': 48.0, 'max_image_height': 48, 'min_label_text_length': 30, 'average_label_text_length': 32.57, 'max_label_text_length': 35, 'labels': {'0': {'count': 958}, '1': {'count': 111}, '2': {'count': 1024}, '3': {'count': 1774}, '4': {'count': 1233}, '5': {'count': 1247}, '6': {'count': 831}}}} | | [FEVER](https://fever.ai/) (Thorne et al., 2018) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVER-NL](https://huggingface.co/datasets/clips/beir-nl-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FEVER-VN](https://fever.ai/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVERHardNegatives](https://fever.ai/) (Thorne et al., 2018) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 3333} | {'test': {'num_samples': 3333, 'unique_num_labels': 100, 'min_image_width': 800, 'average_image_width': 1098.58, 'max_image_width': 1600, 'min_image_height': 413, 'average_image_height': 747.0, 'max_image_height': 1197, 'labels': {'0': {'count': 33}, '1': {'count': 33}, '2': {'count': 34}, '3': {'count': 33}, '4': {'count': 33}, '5': {'count': 34}, '6': {'count': 33}, '7': {'count': 33}, '8': {'count': 34}, '9': {'count': 33}, '10': {'count': 33}, '11': {'count': 34}, '12': {'count': 33}, '13': {'count': 33}, '14': {'count': 34}, '15': {'count': 33}, '16': {'count': 33}, '17': {'count': 34}, '18': {'count': 33}, '19': {'count': 33}, '20': {'count': 34}, '21': {'count': 33}, '22': {'count': 33}, '23': {'count': 34}, '24': {'count': 33}, '25': {'count': 33}, '26': {'count': 34}, '27': {'count': 33}, '28': {'count': 33}, '29': {'count': 34}, '30': {'count': 33}, '31': {'count': 33}, '32': {'count': 34}, '33': {'count': 33}, '34': {'count': 33}, '35': {'count': 34}, '36': {'count': 33}, '37': {'count': 33}, '38': {'count': 34}, '39': {'count': 33}, '40': {'count': 33}, '41': {'count': 34}, '42': {'count': 33}, '43': {'count': 33}, '44': {'count': 34}, '45': {'count': 33}, '46': {'count': 33}, '47': {'count': 34}, '48': {'count': 33}, '49': {'count': 33}, '50': {'count': 34}, '51': {'count': 33}, '52': {'count': 33}, '53': {'count': 34}, '54': {'count': 33}, '55': {'count': 33}, '56': {'count': 34}, '57': {'count': 33}, '58': {'count': 33}, '59': {'count': 34}, '60': {'count': 33}, '61': {'count': 33}, '62': {'count': 34}, '63': {'count': 33}, '64': {'count': 33}, '65': {'count': 34}, '66': {'count': 33}, '67': {'count': 33}, '68': {'count': 34}, '69': {'count': 33}, '70': {'count': 33}, '71': {'count': 34}, '72': {'count': 33}, '73': {'count': 33}, '74': {'count': 34}, '75': {'count': 33}, '76': {'count': 33}, '77': {'count': 34}, '78': {'count': 33}, '79': {'count': 33}, '80': {'count': 34}, '81': {'count': 33}, '82': {'count': 33}, '83': {'count': 34}, '84': {'count': 33}, '85': {'count': 33}, '86': {'count': 34}, '87': {'count': 33}, '88': {'count': 33}, '89': {'count': 34}, '90': {'count': 33}, '91': {'count': 33}, '92': {'count': 34}, '93': {'count': 33}, '94': {'count': 33}, '95': {'count': 34}, '96': {'count': 33}, '97': {'count': 33}, '98': {'count': 34}, '99': {'count': 33}}}} | | [FGVCAircraftZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 3333} | {'test': {'num_samples': 3333, 'unique_num_labels': 100, 'min_image_width': 800, 'average_image_width': 1098.58, 'max_image_width': 1600, 'min_image_height': 413, 'average_image_height': 747.0, 'max_image_height': 1197, 'min_label_text_length': 38, 'average_label_text_length': 41.46, 'max_label_text_length': 53, 'labels': {'0': {'count': 33}, '1': {'count': 33}, '2': {'count': 34}, '3': {'count': 33}, '4': {'count': 33}, '5': {'count': 34}, '6': {'count': 33}, '7': {'count': 33}, '8': {'count': 34}, '9': {'count': 33}, '10': {'count': 33}, '11': {'count': 34}, '12': {'count': 33}, '13': {'count': 33}, '14': {'count': 34}, '15': {'count': 33}, '16': {'count': 33}, '17': {'count': 34}, '18': {'count': 33}, '19': {'count': 33}, '20': {'count': 34}, '21': {'count': 33}, '22': {'count': 33}, '23': {'count': 34}, '24': {'count': 33}, '25': {'count': 33}, '26': {'count': 34}, '27': {'count': 33}, '28': {'count': 33}, '29': {'count': 34}, '30': {'count': 33}, '31': {'count': 33}, '32': {'count': 34}, '33': {'count': 33}, '34': {'count': 33}, '35': {'count': 34}, '36': {'count': 33}, '37': {'count': 33}, '38': {'count': 34}, '39': {'count': 33}, '40': {'count': 33}, '41': {'count': 34}, '42': {'count': 33}, '43': {'count': 33}, '44': {'count': 34}, '45': {'count': 33}, '46': {'count': 33}, '47': {'count': 34}, '48': {'count': 33}, '49': {'count': 33}, '50': {'count': 34}, '51': {'count': 33}, '52': {'count': 33}, '53': {'count': 34}, '54': {'count': 33}, '55': {'count': 33}, '56': {'count': 34}, '57': {'count': 33}, '58': {'count': 33}, '59': {'count': 34}, '60': {'count': 33}, '61': {'count': 33}, '62': {'count': 34}, '63': {'count': 33}, '64': {'count': 33}, '65': {'count': 34}, '66': {'count': 33}, '67': {'count': 33}, '68': {'count': 34}, '69': {'count': 33}, '70': {'count': 33}, '71': {'count': 34}, '72': {'count': 33}, '73': {'count': 33}, '74': {'count': 34}, '75': {'count': 33}, '76': {'count': 33}, '77': {'count': 34}, '78': {'count': 33}, '79': {'count': 33}, '80': {'count': 34}, '81': {'count': 33}, '82': {'count': 33}, '83': {'count': 34}, '84': {'count': 33}, '85': {'count': 33}, '86': {'count': 34}, '87': {'count': 33}, '88': {'count': 33}, '89': {'count': 34}, '90': {'count': 33}, '91': {'count': 33}, '92': {'count': 34}, '93': {'count': 33}, '94': {'count': 33}, '95': {'count': 34}, '96': {'count': 33}, '97': {'count': 33}, '98': {'count': 34}, '99': {'count': 33}}}} | @@ -295,60 +322,68 @@ The following tables give you an overview of the tasks in MTEB. | [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Financial, Written] | None | None | | [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FiQA2018-NL](https://huggingface.co/datasets/clips/beir-nl-fiqa) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | -| [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | +| [FiQA2018-VN](https://sites.google.com/view/fiqa/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Financial, Written] | None | None | +| [FilipinoHateSpeechClassification.v2](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) (Kanerva et al., 2021) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | -| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) (Eskelinen et al., 2023) | ['fin'] | Classification | s2s | [News, Written] | None | None | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None | +| [FinQARetrieval](https://huggingface.co/datasets/embedding-benchmark/FinQA) (Chen et al., 2021) | ['eng'] | Retrieval | s2p | [Financial] | {'test': 1800} | {'test': {'number_of_characters': 165000, 'num_samples': 1800, 'num_queries': 900, 'num_documents': 900, 'min_document_length': 75, 'average_document_length': 183.33, 'max_document_length': 600, 'unique_documents': 900, 'min_query_length': 3, 'average_query_length': 3.0, 'max_query_length': 3, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 900}} | +| [FinToxicityClassification.v2](https://aclanthology.org/2023.nodalida-1.68) (Eskelinen et al., 2023) | ['fin'] | Classification | s2s | [News, Written] | None | None | +| [FinanceBenchRetrieval](https://huggingface.co/datasets/embedding-benchmark/FinanceBench) (Islam et al., 2023) | ['eng'] | Retrieval | s2p | [Financial] | {'test': 1500} | {'test': {'number_of_characters': 180000, 'num_samples': 1500, 'num_queries': 750, 'num_documents': 750, 'min_document_length': 80, 'average_document_length': 240.0, 'max_document_length': 800, 'unique_documents': 750, 'min_query_length': 4, 'average_query_length': 4.0, 'max_query_length': 4, 'unique_queries': 750, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 750}} | +| [FinancialPhrasebankClassification.v2](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None | | [Flickr30kI2TRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Web, Written] | {'test': 6000} | {'test': {'number_of_characters': 319250, 'num_samples': 6000, 'num_queries': 1000, 'num_documents': 5000, 'min_document_length': 11, 'average_document_length': 63.85, 'max_document_length': 375, 'unique_documents': 4999, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1000, 'min_relevant_docs_per_query': 5, 'average_relevant_docs_per_query': 5.0, 'max_relevant_docs_per_query': 5, 'unique_relevant_docs': 5000}} | | [Flickr30kT2IRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | {'test': 6000} | {'test': {'number_of_characters': 319250, 'num_samples': 6000, 'num_queries': 5000, 'num_documents': 1000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 1000, 'min_query_length': 11, 'average_query_length': 63.85, 'max_query_length': 375, 'unique_queries': 4999, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | | [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | | [Food101Classification](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ImageClassification | i2i | [Web] | {'validation': 25250} | {'validation': {'num_samples': 25250, 'unique_num_labels': 101, 'min_image_width': 287, 'average_image_width': 495.82, 'max_image_width': 512, 'min_image_height': 213, 'average_image_height': 475.08, 'max_image_height': 512, 'labels': {'6': {'count': 250}, '79': {'count': 250}, '81': {'count': 250}, '53': {'count': 250}, '10': {'count': 250}, '20': {'count': 250}, '77': {'count': 250}, '48': {'count': 250}, '86': {'count': 250}, '84': {'count': 250}, '76': {'count': 250}, '34': {'count': 250}, '51': {'count': 250}, '21': {'count': 250}, '64': {'count': 250}, '0': {'count': 250}, '43': {'count': 250}, '44': {'count': 250}, '73': {'count': 250}, '57': {'count': 250}, '14': {'count': 250}, '5': {'count': 250}, '46': {'count': 250}, '55': {'count': 250}, '93': {'count': 250}, '98': {'count': 250}, '38': {'count': 250}, '11': {'count': 250}, '99': {'count': 250}, '72': {'count': 250}, '22': {'count': 250}, '59': {'count': 250}, '70': {'count': 250}, '16': {'count': 250}, '2': {'count': 250}, '58': {'count': 250}, '83': {'count': 250}, '96': {'count': 250}, '39': {'count': 250}, '49': {'count': 250}, '45': {'count': 250}, '88': {'count': 250}, '9': {'count': 250}, '26': {'count': 250}, '94': {'count': 250}, '4': {'count': 250}, '65': {'count': 250}, '32': {'count': 250}, '27': {'count': 250}, '36': {'count': 250}, '87': {'count': 250}, '69': {'count': 250}, '85': {'count': 250}, '25': {'count': 250}, '40': {'count': 250}, '19': {'count': 250}, '35': {'count': 250}, '56': {'count': 250}, '42': {'count': 250}, '60': {'count': 250}, '68': {'count': 250}, '100': {'count': 250}, '41': {'count': 250}, '92': {'count': 250}, '24': {'count': 250}, '3': {'count': 250}, '89': {'count': 250}, '75': {'count': 250}, '17': {'count': 250}, '97': {'count': 250}, '61': {'count': 250}, '33': {'count': 250}, '80': {'count': 250}, '30': {'count': 250}, '8': {'count': 250}, '74': {'count': 250}, '66': {'count': 250}, '31': {'count': 250}, '18': {'count': 250}, '67': {'count': 250}, '37': {'count': 250}, '13': {'count': 250}, '63': {'count': 250}, '28': {'count': 250}, '47': {'count': 250}, '52': {'count': 250}, '54': {'count': 250}, '1': {'count': 250}, '82': {'count': 250}, '91': {'count': 250}, '95': {'count': 250}, '7': {'count': 250}, '29': {'count': 250}, '78': {'count': 250}, '15': {'count': 250}, '23': {'count': 250}, '12': {'count': 250}, '62': {'count': 250}, '50': {'count': 250}, '71': {'count': 250}, '90': {'count': 250}}}} | | [Food101ZeroShot](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Web] | {'validation': 25250} | {'validation': {'num_samples': 25250, 'unique_num_labels': 101, 'min_image_width': 287, 'average_image_width': 495.82, 'max_image_width': 512, 'min_image_height': 213, 'average_image_height': 475.08, 'max_image_height': 512, 'min_label_text_length': 31, 'average_label_text_length': 38.72, 'max_label_text_length': 51, 'labels': {'6': {'count': 250}, '79': {'count': 250}, '81': {'count': 250}, '53': {'count': 250}, '10': {'count': 250}, '20': {'count': 250}, '77': {'count': 250}, '48': {'count': 250}, '86': {'count': 250}, '84': {'count': 250}, '76': {'count': 250}, '34': {'count': 250}, '51': {'count': 250}, '21': {'count': 250}, '64': {'count': 250}, '0': {'count': 250}, '43': {'count': 250}, '44': {'count': 250}, '73': {'count': 250}, '57': {'count': 250}, '14': {'count': 250}, '5': {'count': 250}, '46': {'count': 250}, '55': {'count': 250}, '93': {'count': 250}, '98': {'count': 250}, '38': {'count': 250}, '11': {'count': 250}, '99': {'count': 250}, '72': {'count': 250}, '22': {'count': 250}, '59': {'count': 250}, '70': {'count': 250}, '16': {'count': 250}, '2': {'count': 250}, '58': {'count': 250}, '83': {'count': 250}, '96': {'count': 250}, '39': {'count': 250}, '49': {'count': 250}, '45': {'count': 250}, '88': {'count': 250}, '9': {'count': 250}, '26': {'count': 250}, '94': {'count': 250}, '4': {'count': 250}, '65': {'count': 250}, '32': {'count': 250}, '27': {'count': 250}, '36': {'count': 250}, '87': {'count': 250}, '69': {'count': 250}, '85': {'count': 250}, '25': {'count': 250}, '40': {'count': 250}, '19': {'count': 250}, '35': {'count': 250}, '56': {'count': 250}, '42': {'count': 250}, '60': {'count': 250}, '68': {'count': 250}, '100': {'count': 250}, '41': {'count': 250}, '92': {'count': 250}, '24': {'count': 250}, '3': {'count': 250}, '89': {'count': 250}, '75': {'count': 250}, '17': {'count': 250}, '97': {'count': 250}, '61': {'count': 250}, '33': {'count': 250}, '80': {'count': 250}, '30': {'count': 250}, '8': {'count': 250}, '74': {'count': 250}, '66': {'count': 250}, '31': {'count': 250}, '18': {'count': 250}, '67': {'count': 250}, '37': {'count': 250}, '13': {'count': 250}, '63': {'count': 250}, '28': {'count': 250}, '47': {'count': 250}, '52': {'count': 250}, '54': {'count': 250}, '1': {'count': 250}, '82': {'count': 250}, '91': {'count': 250}, '95': {'count': 250}, '7': {'count': 250}, '29': {'count': 250}, '78': {'count': 250}, '15': {'count': 250}, '23': {'count': 250}, '12': {'count': 250}, '62': {'count': 250}, '50': {'count': 250}, '71': {'count': 250}, '90': {'count': 250}}}} | -| [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | -| [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | -| [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None | -| [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | None | None | +| [FrenchBookReviews.v2](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | +| [FrenkEnClassification.v2](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [FrenkHrClassification.v2](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None | +| [FrenkSlClassification.v2](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | None | None | +| [FreshStackRetrieval](https://huggingface.co/datasets/embedding-benchmark/FreshStack_mteb) (FreshStack Authors, 2023) | ['eng', 'go', 'javascript', 'python'] | Retrieval | s2s | [Programming] | {'test': 3000} | {'test': {'number_of_characters': 125000, 'num_samples': 3000, 'num_queries': 1500, 'num_documents': 1500, 'min_document_length': 30, 'average_document_length': 83.33, 'max_document_length': 300, 'unique_documents': 1500, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 1500, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1500}} | | [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [GLDv2I2IRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 762886} | {'test': {'number_of_characters': 0, 'num_samples': 762886, 'num_queries': 1129, 'num_documents': 761757, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 761757, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1129, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 13.41, 'max_relevant_docs_per_query': 168, 'unique_relevant_docs': 3081}} | | [GLDv2I2TRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 2378} | {'test': {'number_of_characters': 14829, 'num_samples': 2378, 'num_queries': 1704, 'num_documents': 674, 'min_document_length': 4, 'average_document_length': 22.0, 'max_document_length': 70, 'unique_documents': 674, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1704, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.14, 'max_relevant_docs_per_query': 3, 'unique_relevant_docs': 674}} | | [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | | [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 12630} | {'test': {'num_samples': 12630, 'unique_num_labels': 43, 'min_image_width': 25, 'average_image_width': 50.51, 'max_image_width': 266, 'min_image_height': 25, 'average_image_height': 50.36, 'max_image_height': 232, 'labels': {'16': {'count': 150}, '1': {'count': 720}, '38': {'count': 690}, '33': {'count': 210}, '11': {'count': 420}, '18': {'count': 390}, '12': {'count': 690}, '25': {'count': 480}, '35': {'count': 390}, '7': {'count': 450}, '23': {'count': 150}, '4': {'count': 660}, '9': {'count': 480}, '21': {'count': 90}, '20': {'count': 90}, '27': {'count': 60}, '3': {'count': 450}, '13': {'count': 720}, '10': {'count': 660}, '5': {'count': 630}, '17': {'count': 360}, '34': {'count': 120}, '2': {'count': 750}, '8': {'count': 450}, '30': {'count': 150}, '24': {'count': 90}, '15': {'count': 210}, '26': {'count': 180}, '28': {'count': 150}, '22': {'count': 120}, '14': {'count': 270}, '32': {'count': 60}, '29': {'count': 90}, '6': {'count': 150}, '36': {'count': 120}, '40': {'count': 90}, '41': {'count': 60}, '31': {'count': 270}, '19': {'count': 60}, '0': {'count': 60}, '39': {'count': 90}, '42': {'count': 90}, '37': {'count': 60}}}} | | [GTSRBZeroShot](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 12630} | {'test': {'num_samples': 12630, 'unique_num_labels': 43, 'min_image_width': 25, 'average_image_width': 50.51, 'max_image_width': 266, 'min_image_height': 25, 'average_image_height': 50.36, 'max_image_height': 232, 'min_label_text_length': 43, 'average_label_text_length': 88.23, 'max_label_text_length': 116, 'labels': {'16': {'count': 150}, '1': {'count': 720}, '38': {'count': 690}, '33': {'count': 210}, '11': {'count': 420}, '18': {'count': 390}, '12': {'count': 690}, '25': {'count': 480}, '35': {'count': 390}, '7': {'count': 450}, '23': {'count': 150}, '4': {'count': 660}, '9': {'count': 480}, '21': {'count': 90}, '20': {'count': 90}, '27': {'count': 60}, '3': {'count': 450}, '13': {'count': 720}, '10': {'count': 660}, '5': {'count': 630}, '17': {'count': 360}, '34': {'count': 120}, '2': {'count': 750}, '8': {'count': 450}, '30': {'count': 150}, '24': {'count': 90}, '15': {'count': 210}, '26': {'count': 180}, '28': {'count': 150}, '22': {'count': 120}, '14': {'count': 270}, '32': {'count': 60}, '29': {'count': 90}, '6': {'count': 150}, '36': {'count': 120}, '40': {'count': 90}, '41': {'count': 60}, '31': {'count': 270}, '19': {'count': 60}, '0': {'count': 60}, '39': {'count': 90}, '42': {'count': 90}, '37': {'count': 60}}}} | -| [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | +| [GeoreviewClassification.v2](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None | | [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | None | None | | [GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) (Wrzalik et al., 2021) | ['deu'] | Retrieval | s2p | [Legal] | None | None | | [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) (Wrzalik et al., 2021) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | | [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | [Non-fiction, Web, Written] | None | None | | [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) (Schröder et al., 2022) | ['deu'] | Retrieval | s2p | [Government, Written] | None | None | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) (Schmidt et al., 2022) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | +| [GermanPoliticiansTwitterSentimentClassification.v2](https://aclanthology.org/2022.konvens-1.9) (Schmidt et al., 2022) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | | [GermanQuAD-Retrieval](https://huggingface.co/datasets/deepset/germanquad) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | [Non-fiction, Web, Written] | None | None | | [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | +| [GovReport](https://huggingface.co/datasets/launch/gov_report) (Huang et al., 2021) | ['eng'] | Retrieval | t2t | [Government, Legal] | None | None | | [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval | s2p | [Academic, Written] | None | None | | [GreekLegalCodeClassification](https://arxiv.org/abs/2109.15298) (Papaloukas et al., 2021) | ['ell'] | Classification | s2s | [Legal, Written] | None | None | | [GreenNodeTableMarkdownRetrieval](https://huggingface.co/GreenNode) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Financial, Non-fiction] | None | None | -| [GujaratiNewsClassification](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | None | None | +| [GujaratiNewsClassification.v2](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | None | None | | [HALClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) (Mathieu Ciancone, 2024) | ['fra'] | Clustering | s2s | [Academic, Written] | None | None | +| [HC3FinanceRetrieval](https://huggingface.co/datasets/embedding-benchmark/HC3Finance) (Guo et al., 2023) | ['eng'] | Retrieval | s2p | [Financial] | {'test': 2400} | {'test': {'number_of_characters': 200000, 'num_samples': 2400, 'num_queries': 1200, 'num_documents': 1200, 'min_document_length': 50, 'average_document_length': 166.67, 'max_document_length': 500, 'unique_documents': 1200, 'min_query_length': 3, 'average_query_length': 3.0, 'max_query_length': 3, 'unique_queries': 1200, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1200}} | | [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [HamshahriClustring](https://github.com/mallahyari/Farsi-datasets) | ['fas'] | Clustering | p2p | [News] | None | None | | [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) (Fortuna et al., 2019) | ['por'] | Classification | s2s | [Social, Written] | None | None | | [HatefulMemesI2TRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 11000} | {'test': {'number_of_characters': 610257, 'num_samples': 11000, 'num_queries': 1000, 'num_documents': 10000, 'min_document_length': 3, 'average_document_length': 61.03, 'max_document_length': 433, 'unique_documents': 8045, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | | [HatefulMemesT2IRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 11000} | {'test': {'number_of_characters': 55468, 'num_samples': 11000, 'num_queries': 1000, 'num_documents': 10000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 10000, 'min_query_length': 3, 'average_query_length': 55.47, 'max_query_length': 382, 'unique_queries': 829, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | -| [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) (Gudkov et al., 2020) | ['rus'] | Classification | s2s | [News, Written] | None | None | -| [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) (Amram et al., 2018) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | +| [HeadlineClassification.v2](https://aclanthology.org/2020.ngt-1.6/) (Gudkov et al., 2020) | ['rus'] | Classification | s2s | [News, Written] | None | None | +| [HebrewSentimentAnalysis.v2](https://huggingface.co/datasets/hebrew_sentiment) (Amram et al., 2018) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | | [HellaSwag](https://rowanzellers.com/hellaswag/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [HinDialectClassification](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4839) (Bafna et al., 2022) | ['anp', 'awa', 'ben', 'bgc', 'bhb', 'bhd', 'bho', 'bjj', 'bns', 'bra', 'gbm', 'guj', 'hne', 'kfg', 'kfy', 'mag', 'mar', 'mup', 'noe', 'pan', 'raj'] | Classification | s2s | [Social, Spoken, Written] | None | None | -| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) (Dhanwal et al., 2020) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | +| [HindiDiscourseClassification.v2](https://aclanthology.org/2020.lrec-1.149/) (Dhanwal et al., 2020) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | | [HotelReviewSentimentClassification.v2](https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3) (Elnagar et al., 2018) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [HotpotQA](https://hotpotqa.github.io/) (Yang et al., 2018) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-Fa](https://huggingface.co/datasets/MCINext/hotpotqa-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [HotpotQA-NL](https://hotpotqa.github.io/) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PL](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PLHardNegatives](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | +| [HotpotQA-VN](https://hotpotqa.github.io/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQAHardNegatives](https://hotpotqa.github.io/) (Yang et al., 2018) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [HumanEvalRetrieval](https://huggingface.co/datasets/embedding-benchmark/HumanEval) (Chen et al., 2021) | ['eng', 'python'] | Retrieval | s2s | [Programming] | {'test': 316} | {'test': {'number_of_characters': 46334, 'num_samples': 316, 'num_queries': 158, 'num_documents': 158, 'min_document_length': 23, 'average_document_length': 291.25, 'max_document_length': 1200, 'unique_documents': 158, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 158, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 158}} | | [HunSum2AbstractiveRetrieval](https://arxiv.org/abs/2404.03555) (Botond Barta, 2024) | ['hun'] | Retrieval | s2p | [News, Written] | None | None | -| [IFlyTek](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | +| [IFlyTek.v2](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | | [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | | [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | | [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) (Cettolo et al., 2017) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | @@ -358,8 +393,9 @@ The following tables give you an overview of the tasks in MTEB. | [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 1076} | {'test': {'num_samples': 1076, 'unique_num_labels': 15, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 152}, '1': {'count': 88}, '2': {'count': 75}, '3': {'count': 96}, '4': {'count': 57}, '5': {'count': 50}, '6': {'count': 52}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 53}, '11': {'count': 57}, '12': {'count': 50}, '13': {'count': 100}, '14': {'count': 96}}}} | | [Imagenet1k](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 50000} | {'test': {'num_samples': 50000, 'unique_num_labels': 1000, 'min_image_width': 54, 'average_image_width': 490.4, 'max_image_width': 4288, 'min_image_height': 56, 'average_image_height': 430.24, 'max_image_height': 5005, 'labels': {'0': {'count': 50}, '1': {'count': 50}, '2': {'count': 50}, '3': {'count': 50}, '4': {'count': 50}, '5': {'count': 50}, '6': {'count': 50}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 50}, '11': {'count': 50}, '12': {'count': 50}, '13': {'count': 50}, '14': {'count': 50}, '15': {'count': 50}, '16': {'count': 50}, '17': {'count': 50}, '18': {'count': 50}, '19': {'count': 50}, '20': {'count': 50}, '21': {'count': 50}, '22': {'count': 50}, '23': {'count': 50}, '24': {'count': 50}, '25': {'count': 50}, '26': {'count': 50}, '27': {'count': 50}, '28': {'count': 50}, '29': {'count': 50}, '30': {'count': 50}, '31': {'count': 50}, '32': {'count': 50}, '33': {'count': 50}, '34': {'count': 50}, '35': {'count': 50}, '36': {'count': 50}, '37': {'count': 50}, '38': {'count': 50}, '39': {'count': 50}, '40': {'count': 50}, '41': {'count': 50}, '42': {'count': 50}, '43': {'count': 50}, '44': {'count': 50}, '45': {'count': 50}, '46': {'count': 50}, '47': {'count': 50}, '48': {'count': 50}, '49': {'count': 50}, '50': {'count': 50}, '51': {'count': 50}, '52': {'count': 50}, '53': {'count': 50}, '54': {'count': 50}, '55': {'count': 50}, '56': {'count': 50}, '57': {'count': 50}, '58': {'count': 50}, '59': {'count': 50}, '60': {'count': 50}, '61': {'count': 50}, '62': {'count': 50}, '63': {'count': 50}, '64': {'count': 50}, '65': {'count': 50}, '66': {'count': 50}, '67': {'count': 50}, '68': {'count': 50}, '69': {'count': 50}, '70': {'count': 50}, '71': {'count': 50}, '72': {'count': 50}, '73': {'count': 50}, '74': {'count': 50}, '75': {'count': 50}, '76': {'count': 50}, '77': {'count': 50}, '78': {'count': 50}, '79': {'count': 50}, '80': {'count': 50}, '81': {'count': 50}, '82': {'count': 50}, '83': {'count': 50}, '84': {'count': 50}, '85': {'count': 50}, '86': {'count': 50}, '87': {'count': 50}, '88': {'count': 50}, '89': {'count': 50}, '90': {'count': 50}, '91': {'count': 50}, '92': {'count': 50}, '93': {'count': 50}, '94': {'count': 50}, '95': {'count': 50}, '96': {'count': 50}, '97': {'count': 50}, '98': {'count': 50}, '99': {'count': 50}, '100': {'count': 50}, '101': {'count': 50}, '102': {'count': 50}, '103': {'count': 50}, '104': {'count': 50}, '105': {'count': 50}, '106': {'count': 50}, '107': {'count': 50}, '108': {'count': 50}, '109': {'count': 50}, '110': {'count': 50}, '111': {'count': 50}, '112': {'count': 50}, '113': {'count': 50}, '114': {'count': 50}, '115': {'count': 50}, '116': {'count': 50}, '117': {'count': 50}, '118': {'count': 50}, '119': {'count': 50}, '120': {'count': 50}, '121': {'count': 50}, '122': {'count': 50}, '123': {'count': 50}, '124': {'count': 50}, '125': {'count': 50}, '126': {'count': 50}, '127': {'count': 50}, '128': {'count': 50}, '129': {'count': 50}, '130': {'count': 50}, '131': {'count': 50}, '132': {'count': 50}, '133': {'count': 50}, '134': {'count': 50}, '135': {'count': 50}, '136': {'count': 50}, '137': {'count': 50}, '138': {'count': 50}, '139': {'count': 50}, '140': {'count': 50}, '141': {'count': 50}, '142': {'count': 50}, '143': {'count': 50}, '144': {'count': 50}, '145': {'count': 50}, '146': {'count': 50}, '147': {'count': 50}, '148': {'count': 50}, '149': {'count': 50}, '150': {'count': 50}, '151': {'count': 50}, '152': {'count': 50}, '153': {'count': 50}, '154': {'count': 50}, '155': {'count': 50}, '156': {'count': 50}, '157': {'count': 50}, '158': {'count': 50}, '159': {'count': 50}, '160': {'count': 50}, '161': {'count': 50}, '162': {'count': 50}, '163': {'count': 50}, '164': {'count': 50}, '165': {'count': 50}, '166': {'count': 50}, '167': {'count': 50}, '168': {'count': 50}, '169': {'count': 50}, '170': {'count': 50}, '171': {'count': 50}, '172': {'count': 50}, '173': {'count': 50}, '174': {'count': 50}, '175': {'count': 50}, '176': {'count': 50}, '177': {'count': 50}, '178': {'count': 50}, '179': {'count': 50}, '180': {'count': 50}, '181': {'count': 50}, '182': {'count': 50}, '183': {'count': 50}, '184': {'count': 50}, '185': {'count': 50}, '186': {'count': 50}, '187': {'count': 50}, '188': {'count': 50}, '189': {'count': 50}, '190': {'count': 50}, '191': {'count': 50}, '192': {'count': 50}, '193': {'count': 50}, '194': {'count': 50}, '195': {'count': 50}, '196': {'count': 50}, '197': {'count': 50}, '198': {'count': 50}, '199': {'count': 50}, '200': {'count': 50}, '201': {'count': 50}, '202': {'count': 50}, '203': {'count': 50}, '204': {'count': 50}, '205': {'count': 50}, '206': {'count': 50}, '207': {'count': 50}, '208': {'count': 50}, '209': {'count': 50}, '210': {'count': 50}, '211': {'count': 50}, '212': {'count': 50}, '213': {'count': 50}, '214': {'count': 50}, '215': {'count': 50}, '216': {'count': 50}, '217': {'count': 50}, '218': {'count': 50}, '219': {'count': 50}, '220': {'count': 50}, '221': {'count': 50}, '222': {'count': 50}, '223': {'count': 50}, '224': {'count': 50}, '225': {'count': 50}, '226': {'count': 50}, '227': {'count': 50}, '228': {'count': 50}, '229': {'count': 50}, '230': {'count': 50}, '231': {'count': 50}, '232': {'count': 50}, '233': {'count': 50}, '234': {'count': 50}, '235': {'count': 50}, '236': {'count': 50}, '237': {'count': 50}, '238': {'count': 50}, '239': {'count': 50}, '240': {'count': 50}, '241': {'count': 50}, '242': {'count': 50}, '243': {'count': 50}, '244': {'count': 50}, '245': {'count': 50}, '246': {'count': 50}, '247': {'count': 50}, '248': {'count': 50}, '249': {'count': 50}, '250': {'count': 50}, '251': {'count': 50}, '252': {'count': 50}, '253': {'count': 50}, '254': {'count': 50}, '255': {'count': 50}, '256': {'count': 50}, '257': {'count': 50}, '258': {'count': 50}, '259': {'count': 50}, '260': {'count': 50}, '261': {'count': 50}, '262': {'count': 50}, '263': {'count': 50}, '264': {'count': 50}, '265': {'count': 50}, '266': {'count': 50}, '267': {'count': 50}, '268': {'count': 50}, '269': {'count': 50}, '270': {'count': 50}, '271': {'count': 50}, '272': {'count': 50}, '273': {'count': 50}, '274': {'count': 50}, '275': {'count': 50}, '276': {'count': 50}, '277': {'count': 50}, '278': {'count': 50}, '279': {'count': 50}, '280': {'count': 50}, '281': {'count': 50}, '282': {'count': 50}, '283': {'count': 50}, '284': {'count': 50}, '285': {'count': 50}, '286': {'count': 50}, '287': {'count': 50}, '288': {'count': 50}, '289': {'count': 50}, '290': {'count': 50}, '291': {'count': 50}, '292': {'count': 50}, '293': {'count': 50}, '294': {'count': 50}, '295': {'count': 50}, '296': {'count': 50}, '297': {'count': 50}, '298': {'count': 50}, '299': {'count': 50}, '300': {'count': 50}, '301': {'count': 50}, '302': {'count': 50}, '303': {'count': 50}, '304': {'count': 50}, '305': {'count': 50}, '306': {'count': 50}, '307': {'count': 50}, '308': {'count': 50}, '309': {'count': 50}, '310': {'count': 50}, '311': {'count': 50}, '312': {'count': 50}, '313': {'count': 50}, '314': {'count': 50}, '315': {'count': 50}, '316': {'count': 50}, '317': {'count': 50}, '318': {'count': 50}, '319': {'count': 50}, '320': {'count': 50}, '321': {'count': 50}, '322': {'count': 50}, '323': {'count': 50}, '324': {'count': 50}, '325': {'count': 50}, '326': {'count': 50}, '327': {'count': 50}, '328': {'count': 50}, '329': {'count': 50}, '330': {'count': 50}, '331': {'count': 50}, '332': {'count': 50}, '333': {'count': 50}, '334': {'count': 50}, '335': {'count': 50}, '336': {'count': 50}, '337': {'count': 50}, '338': {'count': 50}, '339': {'count': 50}, '340': {'count': 50}, '341': {'count': 50}, '342': {'count': 50}, '343': {'count': 50}, '344': {'count': 50}, '345': {'count': 50}, '346': {'count': 50}, '347': {'count': 50}, '348': {'count': 50}, '349': {'count': 50}, '350': {'count': 50}, '351': {'count': 50}, '352': {'count': 50}, '353': {'count': 50}, '354': {'count': 50}, '355': {'count': 50}, '356': {'count': 50}, '357': {'count': 50}, '358': {'count': 50}, '359': {'count': 50}, '360': {'count': 50}, '361': {'count': 50}, '362': {'count': 50}, '363': {'count': 50}, '364': {'count': 50}, '365': {'count': 50}, '366': {'count': 50}, '367': {'count': 50}, '368': {'count': 50}, '369': {'count': 50}, '370': {'count': 50}, '371': {'count': 50}, '372': {'count': 50}, '373': {'count': 50}, '374': {'count': 50}, '375': {'count': 50}, '376': {'count': 50}, '377': {'count': 50}, '378': {'count': 50}, '379': {'count': 50}, '380': {'count': 50}, '381': {'count': 50}, '382': {'count': 50}, '383': {'count': 50}, '384': {'count': 50}, '385': {'count': 50}, '386': {'count': 50}, '387': {'count': 50}, '388': {'count': 50}, '389': {'count': 50}, '390': {'count': 50}, '391': {'count': 50}, '392': {'count': 50}, '393': {'count': 50}, '394': {'count': 50}, '395': {'count': 50}, '396': {'count': 50}, '397': {'count': 50}, '398': {'count': 50}, '399': {'count': 50}, '400': {'count': 50}, '401': {'count': 50}, '402': {'count': 50}, '403': {'count': 50}, '404': {'count': 50}, '405': {'count': 50}, '406': {'count': 50}, '407': {'count': 50}, '408': {'count': 50}, '409': {'count': 50}, '410': {'count': 50}, '411': {'count': 50}, '412': {'count': 50}, '413': {'count': 50}, '414': {'count': 50}, '415': {'count': 50}, '416': {'count': 50}, '417': {'count': 50}, '418': {'count': 50}, '419': {'count': 50}, '420': {'count': 50}, '421': {'count': 50}, '422': {'count': 50}, '423': {'count': 50}, '424': {'count': 50}, '425': {'count': 50}, '426': {'count': 50}, '427': {'count': 50}, '428': {'count': 50}, '429': {'count': 50}, '430': {'count': 50}, '431': {'count': 50}, '432': {'count': 50}, '433': {'count': 50}, '434': {'count': 50}, '435': {'count': 50}, '436': {'count': 50}, '437': {'count': 50}, '438': {'count': 50}, '439': {'count': 50}, '440': {'count': 50}, '441': {'count': 50}, '442': {'count': 50}, '443': {'count': 50}, '444': {'count': 50}, '445': {'count': 50}, '446': {'count': 50}, '447': {'count': 50}, '448': {'count': 50}, '449': {'count': 50}, '450': {'count': 50}, '451': {'count': 50}, '452': {'count': 50}, '453': {'count': 50}, '454': {'count': 50}, '455': {'count': 50}, '456': {'count': 50}, '457': {'count': 50}, '458': {'count': 50}, '459': {'count': 50}, '460': {'count': 50}, '461': {'count': 50}, '462': {'count': 50}, '463': {'count': 50}, '464': {'count': 50}, '465': {'count': 50}, '466': {'count': 50}, '467': {'count': 50}, '468': {'count': 50}, '469': {'count': 50}, '470': {'count': 50}, '471': {'count': 50}, '472': {'count': 50}, '473': {'count': 50}, '474': {'count': 50}, '475': {'count': 50}, '476': {'count': 50}, '477': {'count': 50}, '478': {'count': 50}, '479': {'count': 50}, '480': {'count': 50}, '481': {'count': 50}, '482': {'count': 50}, '483': {'count': 50}, '484': {'count': 50}, '485': {'count': 50}, '486': {'count': 50}, '487': {'count': 50}, '488': {'count': 50}, '489': {'count': 50}, '490': {'count': 50}, '491': {'count': 50}, '492': {'count': 50}, '493': {'count': 50}, '494': {'count': 50}, '495': {'count': 50}, '496': {'count': 50}, '497': {'count': 50}, '498': {'count': 50}, '499': {'count': 50}, '500': {'count': 50}, '501': {'count': 50}, '502': {'count': 50}, '503': {'count': 50}, '504': {'count': 50}, '505': {'count': 50}, '506': {'count': 50}, '507': {'count': 50}, '508': {'count': 50}, '509': {'count': 50}, '510': {'count': 50}, '511': {'count': 50}, '512': {'count': 50}, '513': {'count': 50}, '514': {'count': 50}, '515': {'count': 50}, '516': {'count': 50}, '517': {'count': 50}, '518': {'count': 50}, '519': {'count': 50}, '520': {'count': 50}, '521': {'count': 50}, '522': {'count': 50}, '523': {'count': 50}, '524': {'count': 50}, '525': {'count': 50}, '526': {'count': 50}, '527': {'count': 50}, '528': {'count': 50}, '529': {'count': 50}, '530': {'count': 50}, '531': {'count': 50}, '532': {'count': 50}, '533': {'count': 50}, '534': {'count': 50}, '535': {'count': 50}, '536': {'count': 50}, '537': {'count': 50}, '538': {'count': 50}, '539': {'count': 50}, '540': {'count': 50}, '541': {'count': 50}, '542': {'count': 50}, '543': {'count': 50}, '544': {'count': 50}, '545': {'count': 50}, '546': {'count': 50}, '547': {'count': 50}, '548': {'count': 50}, '549': {'count': 50}, '550': {'count': 50}, '551': {'count': 50}, '552': {'count': 50}, '553': {'count': 50}, '554': {'count': 50}, '555': {'count': 50}, '556': {'count': 50}, '557': {'count': 50}, '558': {'count': 50}, '559': {'count': 50}, '560': {'count': 50}, '561': {'count': 50}, '562': {'count': 50}, '563': {'count': 50}, '564': {'count': 50}, '565': {'count': 50}, '566': {'count': 50}, '567': {'count': 50}, '568': {'count': 50}, '569': {'count': 50}, '570': {'count': 50}, '571': {'count': 50}, '572': {'count': 50}, '573': {'count': 50}, '574': {'count': 50}, '575': {'count': 50}, '576': {'count': 50}, '577': {'count': 50}, '578': {'count': 50}, '579': {'count': 50}, '580': {'count': 50}, '581': {'count': 50}, '582': {'count': 50}, '583': {'count': 50}, '584': {'count': 50}, '585': {'count': 50}, '586': {'count': 50}, '587': {'count': 50}, '588': {'count': 50}, '589': {'count': 50}, '590': {'count': 50}, '591': {'count': 50}, '592': {'count': 50}, '593': {'count': 50}, '594': {'count': 50}, '595': {'count': 50}, '596': {'count': 50}, '597': {'count': 50}, '598': {'count': 50}, '599': {'count': 50}, '600': {'count': 50}, '601': {'count': 50}, '602': {'count': 50}, '603': {'count': 50}, '604': {'count': 50}, '605': {'count': 50}, '606': {'count': 50}, '607': {'count': 50}, '608': {'count': 50}, '609': {'count': 50}, '610': {'count': 50}, '611': {'count': 50}, '612': {'count': 50}, '613': {'count': 50}, '614': {'count': 50}, '615': {'count': 50}, '616': {'count': 50}, '617': {'count': 50}, '618': {'count': 50}, '619': {'count': 50}, '620': {'count': 50}, '621': {'count': 50}, '622': {'count': 50}, '623': {'count': 50}, '624': {'count': 50}, '625': {'count': 50}, '626': {'count': 50}, '627': {'count': 50}, '628': {'count': 50}, '629': {'count': 50}, '630': {'count': 50}, '631': {'count': 50}, '632': {'count': 50}, '633': {'count': 50}, '634': {'count': 50}, '635': {'count': 50}, '636': {'count': 50}, '637': {'count': 50}, '638': {'count': 50}, '639': {'count': 50}, '640': {'count': 50}, '641': {'count': 50}, '642': {'count': 50}, '643': {'count': 50}, '644': {'count': 50}, '645': {'count': 50}, '646': {'count': 50}, '647': {'count': 50}, '648': {'count': 50}, '649': {'count': 50}, '650': {'count': 50}, '651': {'count': 50}, '652': {'count': 50}, '653': {'count': 50}, '654': {'count': 50}, '655': {'count': 50}, '656': {'count': 50}, '657': {'count': 50}, '658': {'count': 50}, '659': {'count': 50}, '660': {'count': 50}, '661': {'count': 50}, '662': {'count': 50}, '663': {'count': 50}, '664': {'count': 50}, '665': {'count': 50}, '666': {'count': 50}, '667': {'count': 50}, '668': {'count': 50}, '669': {'count': 50}, '670': {'count': 50}, '671': {'count': 50}, '672': {'count': 50}, '673': {'count': 50}, '674': {'count': 50}, '675': {'count': 50}, '676': {'count': 50}, '677': {'count': 50}, '678': {'count': 50}, '679': {'count': 50}, '680': {'count': 50}, '681': {'count': 50}, '682': {'count': 50}, '683': {'count': 50}, '684': {'count': 50}, '685': {'count': 50}, '686': {'count': 50}, '687': {'count': 50}, '688': {'count': 50}, '689': {'count': 50}, '690': {'count': 50}, '691': {'count': 50}, '692': {'count': 50}, '693': {'count': 50}, '694': {'count': 50}, '695': {'count': 50}, '696': {'count': 50}, '697': {'count': 50}, '698': {'count': 50}, '699': {'count': 50}, '700': {'count': 50}, '701': {'count': 50}, '702': {'count': 50}, '703': {'count': 50}, '704': {'count': 50}, '705': {'count': 50}, '706': {'count': 50}, '707': {'count': 50}, '708': {'count': 50}, '709': {'count': 50}, '710': {'count': 50}, '711': {'count': 50}, '712': {'count': 50}, '713': {'count': 50}, '714': {'count': 50}, '715': {'count': 50}, '716': {'count': 50}, '717': {'count': 50}, '718': {'count': 50}, '719': {'count': 50}, '720': {'count': 50}, '721': {'count': 50}, '722': {'count': 50}, '723': {'count': 50}, '724': {'count': 50}, '725': {'count': 50}, '726': {'count': 50}, '727': {'count': 50}, '728': {'count': 50}, '729': {'count': 50}, '730': {'count': 50}, '731': {'count': 50}, '732': {'count': 50}, '733': {'count': 50}, '734': {'count': 50}, '735': {'count': 50}, '736': {'count': 50}, '737': {'count': 50}, '738': {'count': 50}, '739': {'count': 50}, '740': {'count': 50}, '741': {'count': 50}, '742': {'count': 50}, '743': {'count': 50}, '744': {'count': 50}, '745': {'count': 50}, '746': {'count': 50}, '747': {'count': 50}, '748': {'count': 50}, '749': {'count': 50}, '750': {'count': 50}, '751': {'count': 50}, '752': {'count': 50}, '753': {'count': 50}, '754': {'count': 50}, '755': {'count': 50}, '756': {'count': 50}, '757': {'count': 50}, '758': {'count': 50}, '759': {'count': 50}, '760': {'count': 50}, '761': {'count': 50}, '762': {'count': 50}, '763': {'count': 50}, '764': {'count': 50}, '765': {'count': 50}, '766': {'count': 50}, '767': {'count': 50}, '768': {'count': 50}, '769': {'count': 50}, '770': {'count': 50}, '771': {'count': 50}, '772': {'count': 50}, '773': {'count': 50}, '774': {'count': 50}, '775': {'count': 50}, '776': {'count': 50}, '777': {'count': 50}, '778': {'count': 50}, '779': {'count': 50}, '780': {'count': 50}, '781': {'count': 50}, '782': {'count': 50}, '783': {'count': 50}, '784': {'count': 50}, '785': {'count': 50}, '786': {'count': 50}, '787': {'count': 50}, '788': {'count': 50}, '789': {'count': 50}, '790': {'count': 50}, '791': {'count': 50}, '792': {'count': 50}, '793': {'count': 50}, '794': {'count': 50}, '795': {'count': 50}, '796': {'count': 50}, '797': {'count': 50}, '798': {'count': 50}, '799': {'count': 50}, '800': {'count': 50}, '801': {'count': 50}, '802': {'count': 50}, '803': {'count': 50}, '804': {'count': 50}, '805': {'count': 50}, '806': {'count': 50}, '807': {'count': 50}, '808': {'count': 50}, '809': {'count': 50}, '810': {'count': 50}, '811': {'count': 50}, '812': {'count': 50}, '813': {'count': 50}, '814': {'count': 50}, '815': {'count': 50}, '816': {'count': 50}, '817': {'count': 50}, '818': {'count': 50}, '819': {'count': 50}, '820': {'count': 50}, '821': {'count': 50}, '822': {'count': 50}, '823': {'count': 50}, '824': {'count': 50}, '825': {'count': 50}, '826': {'count': 50}, '827': {'count': 50}, '828': {'count': 50}, '829': {'count': 50}, '830': {'count': 50}, '831': {'count': 50}, '832': {'count': 50}, '833': {'count': 50}, '834': {'count': 50}, '835': {'count': 50}, '836': {'count': 50}, '837': {'count': 50}, '838': {'count': 50}, '839': {'count': 50}, '840': {'count': 50}, '841': {'count': 50}, '842': {'count': 50}, '843': {'count': 50}, '844': {'count': 50}, '845': {'count': 50}, '846': {'count': 50}, '847': {'count': 50}, '848': {'count': 50}, '849': {'count': 50}, '850': {'count': 50}, '851': {'count': 50}, '852': {'count': 50}, '853': {'count': 50}, '854': {'count': 50}, '855': {'count': 50}, '856': {'count': 50}, '857': {'count': 50}, '858': {'count': 50}, '859': {'count': 50}, '860': {'count': 50}, '861': {'count': 50}, '862': {'count': 50}, '863': {'count': 50}, '864': {'count': 50}, '865': {'count': 50}, '866': {'count': 50}, '867': {'count': 50}, '868': {'count': 50}, '869': {'count': 50}, '870': {'count': 50}, '871': {'count': 50}, '872': {'count': 50}, '873': {'count': 50}, '874': {'count': 50}, '875': {'count': 50}, '876': {'count': 50}, '877': {'count': 50}, '878': {'count': 50}, '879': {'count': 50}, '880': {'count': 50}, '881': {'count': 50}, '882': {'count': 50}, '883': {'count': 50}, '884': {'count': 50}, '885': {'count': 50}, '886': {'count': 50}, '887': {'count': 50}, '888': {'count': 50}, '889': {'count': 50}, '890': {'count': 50}, '891': {'count': 50}, '892': {'count': 50}, '893': {'count': 50}, '894': {'count': 50}, '895': {'count': 50}, '896': {'count': 50}, '897': {'count': 50}, '898': {'count': 50}, '899': {'count': 50}, '900': {'count': 50}, '901': {'count': 50}, '902': {'count': 50}, '903': {'count': 50}, '904': {'count': 50}, '905': {'count': 50}, '906': {'count': 50}, '907': {'count': 50}, '908': {'count': 50}, '909': {'count': 50}, '910': {'count': 50}, '911': {'count': 50}, '912': {'count': 50}, '913': {'count': 50}, '914': {'count': 50}, '915': {'count': 50}, '916': {'count': 50}, '917': {'count': 50}, '918': {'count': 50}, '919': {'count': 50}, '920': {'count': 50}, '921': {'count': 50}, '922': {'count': 50}, '923': {'count': 50}, '924': {'count': 50}, '925': {'count': 50}, '926': {'count': 50}, '927': {'count': 50}, '928': {'count': 50}, '929': {'count': 50}, '930': {'count': 50}, '931': {'count': 50}, '932': {'count': 50}, '933': {'count': 50}, '934': {'count': 50}, '935': {'count': 50}, '936': {'count': 50}, '937': {'count': 50}, '938': {'count': 50}, '939': {'count': 50}, '940': {'count': 50}, '941': {'count': 50}, '942': {'count': 50}, '943': {'count': 50}, '944': {'count': 50}, '945': {'count': 50}, '946': {'count': 50}, '947': {'count': 50}, '948': {'count': 50}, '949': {'count': 50}, '950': {'count': 50}, '951': {'count': 50}, '952': {'count': 50}, '953': {'count': 50}, '954': {'count': 50}, '955': {'count': 50}, '956': {'count': 50}, '957': {'count': 50}, '958': {'count': 50}, '959': {'count': 50}, '960': {'count': 50}, '961': {'count': 50}, '962': {'count': 50}, '963': {'count': 50}, '964': {'count': 50}, '965': {'count': 50}, '966': {'count': 50}, '967': {'count': 50}, '968': {'count': 50}, '969': {'count': 50}, '970': {'count': 50}, '971': {'count': 50}, '972': {'count': 50}, '973': {'count': 50}, '974': {'count': 50}, '975': {'count': 50}, '976': {'count': 50}, '977': {'count': 50}, '978': {'count': 50}, '979': {'count': 50}, '980': {'count': 50}, '981': {'count': 50}, '982': {'count': 50}, '983': {'count': 50}, '984': {'count': 50}, '985': {'count': 50}, '986': {'count': 50}, '987': {'count': 50}, '988': {'count': 50}, '989': {'count': 50}, '990': {'count': 50}, '991': {'count': 50}, '992': {'count': 50}, '993': {'count': 50}, '994': {'count': 50}, '995': {'count': 50}, '996': {'count': 50}, '997': {'count': 50}, '998': {'count': 50}, '999': {'count': 50}}}} | | [Imagenet1kZeroShot](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 50000} | {'test': {'num_samples': 50000, 'unique_num_labels': 1000, 'min_image_width': 54, 'average_image_width': 490.4, 'max_image_width': 4288, 'min_image_height': 56, 'average_image_height': 430.24, 'max_image_height': 5005, 'min_label_text_length': 15, 'average_label_text_length': 23.81, 'max_label_text_length': 50, 'labels': {'0': {'count': 50}, '1': {'count': 50}, '2': {'count': 50}, '3': {'count': 50}, '4': {'count': 50}, '5': {'count': 50}, '6': {'count': 50}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 50}, '11': {'count': 50}, '12': {'count': 50}, '13': {'count': 50}, '14': {'count': 50}, '15': {'count': 50}, '16': {'count': 50}, '17': {'count': 50}, '18': {'count': 50}, '19': {'count': 50}, '20': {'count': 50}, '21': {'count': 50}, '22': {'count': 50}, '23': {'count': 50}, '24': {'count': 50}, '25': {'count': 50}, '26': {'count': 50}, '27': {'count': 50}, '28': {'count': 50}, '29': {'count': 50}, '30': {'count': 50}, '31': {'count': 50}, '32': {'count': 50}, '33': {'count': 50}, '34': {'count': 50}, '35': {'count': 50}, '36': {'count': 50}, '37': {'count': 50}, '38': {'count': 50}, '39': {'count': 50}, '40': {'count': 50}, '41': {'count': 50}, '42': {'count': 50}, '43': {'count': 50}, '44': {'count': 50}, '45': {'count': 50}, '46': {'count': 50}, '47': {'count': 50}, '48': {'count': 50}, '49': {'count': 50}, '50': {'count': 50}, '51': {'count': 50}, '52': {'count': 50}, '53': {'count': 50}, '54': {'count': 50}, '55': {'count': 50}, '56': {'count': 50}, '57': {'count': 50}, '58': {'count': 50}, '59': {'count': 50}, '60': {'count': 50}, '61': {'count': 50}, '62': {'count': 50}, '63': {'count': 50}, '64': {'count': 50}, '65': {'count': 50}, '66': {'count': 50}, '67': {'count': 50}, '68': {'count': 50}, '69': {'count': 50}, '70': {'count': 50}, '71': {'count': 50}, '72': {'count': 50}, '73': {'count': 50}, '74': {'count': 50}, '75': {'count': 50}, '76': {'count': 50}, '77': {'count': 50}, '78': {'count': 50}, '79': {'count': 50}, '80': {'count': 50}, '81': {'count': 50}, '82': {'count': 50}, '83': {'count': 50}, '84': {'count': 50}, '85': {'count': 50}, '86': {'count': 50}, '87': {'count': 50}, '88': {'count': 50}, '89': {'count': 50}, '90': {'count': 50}, '91': {'count': 50}, '92': {'count': 50}, '93': {'count': 50}, '94': {'count': 50}, '95': {'count': 50}, '96': {'count': 50}, '97': {'count': 50}, '98': {'count': 50}, '99': {'count': 50}, '100': {'count': 50}, '101': {'count': 50}, '102': {'count': 50}, '103': {'count': 50}, '104': {'count': 50}, '105': {'count': 50}, '106': {'count': 50}, '107': {'count': 50}, '108': {'count': 50}, '109': {'count': 50}, '110': {'count': 50}, '111': {'count': 50}, '112': {'count': 50}, '113': {'count': 50}, '114': {'count': 50}, '115': {'count': 50}, '116': {'count': 50}, '117': {'count': 50}, '118': {'count': 50}, '119': {'count': 50}, '120': {'count': 50}, '121': {'count': 50}, '122': {'count': 50}, '123': {'count': 50}, '124': {'count': 50}, '125': {'count': 50}, '126': {'count': 50}, '127': {'count': 50}, '128': {'count': 50}, '129': {'count': 50}, '130': {'count': 50}, '131': {'count': 50}, '132': {'count': 50}, '133': {'count': 50}, '134': {'count': 50}, '135': {'count': 50}, '136': {'count': 50}, '137': {'count': 50}, '138': {'count': 50}, '139': {'count': 50}, '140': {'count': 50}, '141': {'count': 50}, '142': {'count': 50}, '143': {'count': 50}, '144': {'count': 50}, '145': {'count': 50}, '146': {'count': 50}, '147': {'count': 50}, '148': {'count': 50}, '149': {'count': 50}, '150': {'count': 50}, '151': {'count': 50}, '152': {'count': 50}, '153': {'count': 50}, '154': {'count': 50}, '155': {'count': 50}, '156': {'count': 50}, '157': {'count': 50}, '158': {'count': 50}, '159': {'count': 50}, '160': {'count': 50}, '161': {'count': 50}, '162': {'count': 50}, '163': {'count': 50}, '164': {'count': 50}, '165': {'count': 50}, '166': {'count': 50}, '167': {'count': 50}, '168': {'count': 50}, '169': {'count': 50}, '170': {'count': 50}, '171': {'count': 50}, '172': {'count': 50}, '173': {'count': 50}, '174': {'count': 50}, '175': {'count': 50}, '176': {'count': 50}, '177': {'count': 50}, '178': {'count': 50}, '179': {'count': 50}, '180': {'count': 50}, '181': {'count': 50}, '182': {'count': 50}, '183': {'count': 50}, '184': {'count': 50}, '185': {'count': 50}, '186': {'count': 50}, '187': {'count': 50}, '188': {'count': 50}, '189': {'count': 50}, '190': {'count': 50}, '191': {'count': 50}, '192': {'count': 50}, '193': {'count': 50}, '194': {'count': 50}, '195': {'count': 50}, '196': {'count': 50}, '197': {'count': 50}, '198': {'count': 50}, '199': {'count': 50}, '200': {'count': 50}, '201': {'count': 50}, '202': {'count': 50}, '203': {'count': 50}, '204': {'count': 50}, '205': {'count': 50}, '206': {'count': 50}, '207': {'count': 50}, '208': {'count': 50}, '209': {'count': 50}, '210': {'count': 50}, '211': {'count': 50}, '212': {'count': 50}, '213': {'count': 50}, '214': {'count': 50}, '215': {'count': 50}, '216': {'count': 50}, '217': {'count': 50}, '218': {'count': 50}, '219': {'count': 50}, '220': {'count': 50}, '221': {'count': 50}, '222': {'count': 50}, '223': {'count': 50}, '224': {'count': 50}, '225': {'count': 50}, '226': {'count': 50}, '227': {'count': 50}, '228': {'count': 50}, '229': {'count': 50}, '230': {'count': 50}, '231': {'count': 50}, '232': {'count': 50}, '233': {'count': 50}, '234': {'count': 50}, '235': {'count': 50}, '236': {'count': 50}, '237': {'count': 50}, '238': {'count': 50}, '239': {'count': 50}, '240': {'count': 50}, '241': {'count': 50}, '242': {'count': 50}, '243': {'count': 50}, '244': {'count': 50}, '245': {'count': 50}, '246': {'count': 50}, '247': {'count': 50}, '248': {'count': 50}, '249': {'count': 50}, '250': {'count': 50}, '251': {'count': 50}, '252': {'count': 50}, '253': {'count': 50}, '254': {'count': 50}, '255': {'count': 50}, '256': {'count': 50}, '257': {'count': 50}, '258': {'count': 50}, '259': {'count': 50}, '260': {'count': 50}, '261': {'count': 50}, '262': {'count': 50}, '263': {'count': 50}, '264': {'count': 50}, '265': {'count': 50}, '266': {'count': 50}, '267': {'count': 50}, '268': {'count': 50}, '269': {'count': 50}, '270': {'count': 50}, '271': {'count': 50}, '272': {'count': 50}, '273': {'count': 50}, '274': {'count': 50}, '275': {'count': 50}, '276': {'count': 50}, '277': {'count': 50}, '278': {'count': 50}, '279': {'count': 50}, '280': {'count': 50}, '281': {'count': 50}, '282': {'count': 50}, '283': {'count': 50}, '284': {'count': 50}, '285': {'count': 50}, '286': {'count': 50}, '287': {'count': 50}, '288': {'count': 50}, '289': {'count': 50}, '290': {'count': 50}, '291': {'count': 50}, '292': {'count': 50}, '293': {'count': 50}, '294': {'count': 50}, '295': {'count': 50}, '296': {'count': 50}, '297': {'count': 50}, '298': {'count': 50}, '299': {'count': 50}, '300': {'count': 50}, '301': {'count': 50}, '302': {'count': 50}, '303': {'count': 50}, '304': {'count': 50}, '305': {'count': 50}, '306': {'count': 50}, '307': {'count': 50}, '308': {'count': 50}, '309': {'count': 50}, '310': {'count': 50}, '311': {'count': 50}, '312': {'count': 50}, '313': {'count': 50}, '314': {'count': 50}, '315': {'count': 50}, '316': {'count': 50}, '317': {'count': 50}, '318': {'count': 50}, '319': {'count': 50}, '320': {'count': 50}, '321': {'count': 50}, '322': {'count': 50}, '323': {'count': 50}, '324': {'count': 50}, '325': {'count': 50}, '326': {'count': 50}, '327': {'count': 50}, '328': {'count': 50}, '329': {'count': 50}, '330': {'count': 50}, '331': {'count': 50}, '332': {'count': 50}, '333': {'count': 50}, '334': {'count': 50}, '335': {'count': 50}, '336': {'count': 50}, '337': {'count': 50}, '338': {'count': 50}, '339': {'count': 50}, '340': {'count': 50}, '341': {'count': 50}, '342': {'count': 50}, '343': {'count': 50}, '344': {'count': 50}, '345': {'count': 50}, '346': {'count': 50}, '347': {'count': 50}, '348': {'count': 50}, '349': {'count': 50}, '350': {'count': 50}, '351': {'count': 50}, '352': {'count': 50}, '353': {'count': 50}, '354': {'count': 50}, '355': {'count': 50}, '356': {'count': 50}, '357': {'count': 50}, '358': {'count': 50}, '359': {'count': 50}, '360': {'count': 50}, '361': {'count': 50}, '362': {'count': 50}, '363': {'count': 50}, '364': {'count': 50}, '365': {'count': 50}, '366': {'count': 50}, '367': {'count': 50}, '368': {'count': 50}, '369': {'count': 50}, '370': {'count': 50}, '371': {'count': 50}, '372': {'count': 50}, '373': {'count': 50}, '374': {'count': 50}, '375': {'count': 50}, '376': {'count': 50}, '377': {'count': 50}, '378': {'count': 50}, '379': {'count': 50}, '380': {'count': 50}, '381': {'count': 50}, '382': {'count': 50}, '383': {'count': 50}, '384': {'count': 50}, '385': {'count': 50}, '386': {'count': 50}, '387': {'count': 50}, '388': {'count': 50}, '389': {'count': 50}, '390': {'count': 50}, '391': {'count': 50}, '392': {'count': 50}, '393': {'count': 50}, '394': {'count': 50}, '395': {'count': 50}, '396': {'count': 50}, '397': {'count': 50}, '398': {'count': 50}, '399': {'count': 50}, '400': {'count': 50}, '401': {'count': 50}, '402': {'count': 50}, '403': {'count': 50}, '404': {'count': 50}, '405': {'count': 50}, '406': {'count': 50}, '407': {'count': 50}, '408': {'count': 50}, '409': {'count': 50}, '410': {'count': 50}, '411': {'count': 50}, '412': {'count': 50}, '413': {'count': 50}, '414': {'count': 50}, '415': {'count': 50}, '416': {'count': 50}, '417': {'count': 50}, '418': {'count': 50}, '419': {'count': 50}, '420': {'count': 50}, '421': {'count': 50}, '422': {'count': 50}, '423': {'count': 50}, '424': {'count': 50}, '425': {'count': 50}, '426': {'count': 50}, '427': {'count': 50}, '428': {'count': 50}, '429': {'count': 50}, '430': {'count': 50}, '431': {'count': 50}, '432': {'count': 50}, '433': {'count': 50}, '434': {'count': 50}, '435': {'count': 50}, '436': {'count': 50}, '437': {'count': 50}, '438': {'count': 50}, '439': {'count': 50}, '440': {'count': 50}, '441': {'count': 50}, '442': {'count': 50}, '443': {'count': 50}, '444': {'count': 50}, '445': {'count': 50}, '446': {'count': 50}, '447': {'count': 50}, '448': {'count': 50}, '449': {'count': 50}, '450': {'count': 50}, '451': {'count': 50}, '452': {'count': 50}, '453': {'count': 50}, '454': {'count': 50}, '455': {'count': 50}, '456': {'count': 50}, '457': {'count': 50}, '458': {'count': 50}, '459': {'count': 50}, '460': {'count': 50}, '461': {'count': 50}, '462': {'count': 50}, '463': {'count': 50}, '464': {'count': 50}, '465': {'count': 50}, '466': {'count': 50}, '467': {'count': 50}, '468': {'count': 50}, '469': {'count': 50}, '470': {'count': 50}, '471': {'count': 50}, '472': {'count': 50}, '473': {'count': 50}, '474': {'count': 50}, '475': {'count': 50}, '476': {'count': 50}, '477': {'count': 50}, '478': {'count': 50}, '479': {'count': 50}, '480': {'count': 50}, '481': {'count': 50}, '482': {'count': 50}, '483': {'count': 50}, '484': {'count': 50}, '485': {'count': 50}, '486': {'count': 50}, '487': {'count': 50}, '488': {'count': 50}, '489': {'count': 50}, '490': {'count': 50}, '491': {'count': 50}, '492': {'count': 50}, '493': {'count': 50}, '494': {'count': 50}, '495': {'count': 50}, '496': {'count': 50}, '497': {'count': 50}, '498': {'count': 50}, '499': {'count': 50}, '500': {'count': 50}, '501': {'count': 50}, '502': {'count': 50}, '503': {'count': 50}, '504': {'count': 50}, '505': {'count': 50}, '506': {'count': 50}, '507': {'count': 50}, '508': {'count': 50}, '509': {'count': 50}, '510': {'count': 50}, '511': {'count': 50}, '512': {'count': 50}, '513': {'count': 50}, '514': {'count': 50}, '515': {'count': 50}, '516': {'count': 50}, '517': {'count': 50}, '518': {'count': 50}, '519': {'count': 50}, '520': {'count': 50}, '521': {'count': 50}, '522': {'count': 50}, '523': {'count': 50}, '524': {'count': 50}, '525': {'count': 50}, '526': {'count': 50}, '527': {'count': 50}, '528': {'count': 50}, '529': {'count': 50}, '530': {'count': 50}, '531': {'count': 50}, '532': {'count': 50}, '533': {'count': 50}, '534': {'count': 50}, '535': {'count': 50}, '536': {'count': 50}, '537': {'count': 50}, '538': {'count': 50}, '539': {'count': 50}, '540': {'count': 50}, '541': {'count': 50}, '542': {'count': 50}, '543': {'count': 50}, '544': {'count': 50}, '545': {'count': 50}, '546': {'count': 50}, '547': {'count': 50}, '548': {'count': 50}, '549': {'count': 50}, '550': {'count': 50}, '551': {'count': 50}, '552': {'count': 50}, '553': {'count': 50}, '554': {'count': 50}, '555': {'count': 50}, '556': {'count': 50}, '557': {'count': 50}, '558': {'count': 50}, '559': {'count': 50}, '560': {'count': 50}, '561': {'count': 50}, '562': {'count': 50}, '563': {'count': 50}, '564': {'count': 50}, '565': {'count': 50}, '566': {'count': 50}, '567': {'count': 50}, '568': {'count': 50}, '569': {'count': 50}, '570': {'count': 50}, '571': {'count': 50}, '572': {'count': 50}, '573': {'count': 50}, '574': {'count': 50}, '575': {'count': 50}, '576': {'count': 50}, '577': {'count': 50}, '578': {'count': 50}, '579': {'count': 50}, '580': {'count': 50}, '581': {'count': 50}, '582': {'count': 50}, '583': {'count': 50}, '584': {'count': 50}, '585': {'count': 50}, '586': {'count': 50}, '587': {'count': 50}, '588': {'count': 50}, '589': {'count': 50}, '590': {'count': 50}, '591': {'count': 50}, '592': {'count': 50}, '593': {'count': 50}, '594': {'count': 50}, '595': {'count': 50}, '596': {'count': 50}, '597': {'count': 50}, '598': {'count': 50}, '599': {'count': 50}, '600': {'count': 50}, '601': {'count': 50}, '602': {'count': 50}, '603': {'count': 50}, '604': {'count': 50}, '605': {'count': 50}, '606': {'count': 50}, '607': {'count': 50}, '608': {'count': 50}, '609': {'count': 50}, '610': {'count': 50}, '611': {'count': 50}, '612': {'count': 50}, '613': {'count': 50}, '614': {'count': 50}, '615': {'count': 50}, '616': {'count': 50}, '617': {'count': 50}, '618': {'count': 50}, '619': {'count': 50}, '620': {'count': 50}, '621': {'count': 50}, '622': {'count': 50}, '623': {'count': 50}, '624': {'count': 50}, '625': {'count': 50}, '626': {'count': 50}, '627': {'count': 50}, '628': {'count': 50}, '629': {'count': 50}, '630': {'count': 50}, '631': {'count': 50}, '632': {'count': 50}, '633': {'count': 50}, '634': {'count': 50}, '635': {'count': 50}, '636': {'count': 50}, '637': {'count': 50}, '638': {'count': 50}, '639': {'count': 50}, '640': {'count': 50}, '641': {'count': 50}, '642': {'count': 50}, '643': {'count': 50}, '644': {'count': 50}, '645': {'count': 50}, '646': {'count': 50}, '647': {'count': 50}, '648': {'count': 50}, '649': {'count': 50}, '650': {'count': 50}, '651': {'count': 50}, '652': {'count': 50}, '653': {'count': 50}, '654': {'count': 50}, '655': {'count': 50}, '656': {'count': 50}, '657': {'count': 50}, '658': {'count': 50}, '659': {'count': 50}, '660': {'count': 50}, '661': {'count': 50}, '662': {'count': 50}, '663': {'count': 50}, '664': {'count': 50}, '665': {'count': 50}, '666': {'count': 50}, '667': {'count': 50}, '668': {'count': 50}, '669': {'count': 50}, '670': {'count': 50}, '671': {'count': 50}, '672': {'count': 50}, '673': {'count': 50}, '674': {'count': 50}, '675': {'count': 50}, '676': {'count': 50}, '677': {'count': 50}, '678': {'count': 50}, '679': {'count': 50}, '680': {'count': 50}, '681': {'count': 50}, '682': {'count': 50}, '683': {'count': 50}, '684': {'count': 50}, '685': {'count': 50}, '686': {'count': 50}, '687': {'count': 50}, '688': {'count': 50}, '689': {'count': 50}, '690': {'count': 50}, '691': {'count': 50}, '692': {'count': 50}, '693': {'count': 50}, '694': {'count': 50}, '695': {'count': 50}, '696': {'count': 50}, '697': {'count': 50}, '698': {'count': 50}, '699': {'count': 50}, '700': {'count': 50}, '701': {'count': 50}, '702': {'count': 50}, '703': {'count': 50}, '704': {'count': 50}, '705': {'count': 50}, '706': {'count': 50}, '707': {'count': 50}, '708': {'count': 50}, '709': {'count': 50}, '710': {'count': 50}, '711': {'count': 50}, '712': {'count': 50}, '713': {'count': 50}, '714': {'count': 50}, '715': {'count': 50}, '716': {'count': 50}, '717': {'count': 50}, '718': {'count': 50}, '719': {'count': 50}, '720': {'count': 50}, '721': {'count': 50}, '722': {'count': 50}, '723': {'count': 50}, '724': {'count': 50}, '725': {'count': 50}, '726': {'count': 50}, '727': {'count': 50}, '728': {'count': 50}, '729': {'count': 50}, '730': {'count': 50}, '731': {'count': 50}, '732': {'count': 50}, '733': {'count': 50}, '734': {'count': 50}, '735': {'count': 50}, '736': {'count': 50}, '737': {'count': 50}, '738': {'count': 50}, '739': {'count': 50}, '740': {'count': 50}, '741': {'count': 50}, '742': {'count': 50}, '743': {'count': 50}, '744': {'count': 50}, '745': {'count': 50}, '746': {'count': 50}, '747': {'count': 50}, '748': {'count': 50}, '749': {'count': 50}, '750': {'count': 50}, '751': {'count': 50}, '752': {'count': 50}, '753': {'count': 50}, '754': {'count': 50}, '755': {'count': 50}, '756': {'count': 50}, '757': {'count': 50}, '758': {'count': 50}, '759': {'count': 50}, '760': {'count': 50}, '761': {'count': 50}, '762': {'count': 50}, '763': {'count': 50}, '764': {'count': 50}, '765': {'count': 50}, '766': {'count': 50}, '767': {'count': 50}, '768': {'count': 50}, '769': {'count': 50}, '770': {'count': 50}, '771': {'count': 50}, '772': {'count': 50}, '773': {'count': 50}, '774': {'count': 50}, '775': {'count': 50}, '776': {'count': 50}, '777': {'count': 50}, '778': {'count': 50}, '779': {'count': 50}, '780': {'count': 50}, '781': {'count': 50}, '782': {'count': 50}, '783': {'count': 50}, '784': {'count': 50}, '785': {'count': 50}, '786': {'count': 50}, '787': {'count': 50}, '788': {'count': 50}, '789': {'count': 50}, '790': {'count': 50}, '791': {'count': 50}, '792': {'count': 50}, '793': {'count': 50}, '794': {'count': 50}, '795': {'count': 50}, '796': {'count': 50}, '797': {'count': 50}, '798': {'count': 50}, '799': {'count': 50}, '800': {'count': 50}, '801': {'count': 50}, '802': {'count': 50}, '803': {'count': 50}, '804': {'count': 50}, '805': {'count': 50}, '806': {'count': 50}, '807': {'count': 50}, '808': {'count': 50}, '809': {'count': 50}, '810': {'count': 50}, '811': {'count': 50}, '812': {'count': 50}, '813': {'count': 50}, '814': {'count': 50}, '815': {'count': 50}, '816': {'count': 50}, '817': {'count': 50}, '818': {'count': 50}, '819': {'count': 50}, '820': {'count': 50}, '821': {'count': 50}, '822': {'count': 50}, '823': {'count': 50}, '824': {'count': 50}, '825': {'count': 50}, '826': {'count': 50}, '827': {'count': 50}, '828': {'count': 50}, '829': {'count': 50}, '830': {'count': 50}, '831': {'count': 50}, '832': {'count': 50}, '833': {'count': 50}, '834': {'count': 50}, '835': {'count': 50}, '836': {'count': 50}, '837': {'count': 50}, '838': {'count': 50}, '839': {'count': 50}, '840': {'count': 50}, '841': {'count': 50}, '842': {'count': 50}, '843': {'count': 50}, '844': {'count': 50}, '845': {'count': 50}, '846': {'count': 50}, '847': {'count': 50}, '848': {'count': 50}, '849': {'count': 50}, '850': {'count': 50}, '851': {'count': 50}, '852': {'count': 50}, '853': {'count': 50}, '854': {'count': 50}, '855': {'count': 50}, '856': {'count': 50}, '857': {'count': 50}, '858': {'count': 50}, '859': {'count': 50}, '860': {'count': 50}, '861': {'count': 50}, '862': {'count': 50}, '863': {'count': 50}, '864': {'count': 50}, '865': {'count': 50}, '866': {'count': 50}, '867': {'count': 50}, '868': {'count': 50}, '869': {'count': 50}, '870': {'count': 50}, '871': {'count': 50}, '872': {'count': 50}, '873': {'count': 50}, '874': {'count': 50}, '875': {'count': 50}, '876': {'count': 50}, '877': {'count': 50}, '878': {'count': 50}, '879': {'count': 50}, '880': {'count': 50}, '881': {'count': 50}, '882': {'count': 50}, '883': {'count': 50}, '884': {'count': 50}, '885': {'count': 50}, '886': {'count': 50}, '887': {'count': 50}, '888': {'count': 50}, '889': {'count': 50}, '890': {'count': 50}, '891': {'count': 50}, '892': {'count': 50}, '893': {'count': 50}, '894': {'count': 50}, '895': {'count': 50}, '896': {'count': 50}, '897': {'count': 50}, '898': {'count': 50}, '899': {'count': 50}, '900': {'count': 50}, '901': {'count': 50}, '902': {'count': 50}, '903': {'count': 50}, '904': {'count': 50}, '905': {'count': 50}, '906': {'count': 50}, '907': {'count': 50}, '908': {'count': 50}, '909': {'count': 50}, '910': {'count': 50}, '911': {'count': 50}, '912': {'count': 50}, '913': {'count': 50}, '914': {'count': 50}, '915': {'count': 50}, '916': {'count': 50}, '917': {'count': 50}, '918': {'count': 50}, '919': {'count': 50}, '920': {'count': 50}, '921': {'count': 50}, '922': {'count': 50}, '923': {'count': 50}, '924': {'count': 50}, '925': {'count': 50}, '926': {'count': 50}, '927': {'count': 50}, '928': {'count': 50}, '929': {'count': 50}, '930': {'count': 50}, '931': {'count': 50}, '932': {'count': 50}, '933': {'count': 50}, '934': {'count': 50}, '935': {'count': 50}, '936': {'count': 50}, '937': {'count': 50}, '938': {'count': 50}, '939': {'count': 50}, '940': {'count': 50}, '941': {'count': 50}, '942': {'count': 50}, '943': {'count': 50}, '944': {'count': 50}, '945': {'count': 50}, '946': {'count': 50}, '947': {'count': 50}, '948': {'count': 50}, '949': {'count': 50}, '950': {'count': 50}, '951': {'count': 50}, '952': {'count': 50}, '953': {'count': 50}, '954': {'count': 50}, '955': {'count': 50}, '956': {'count': 50}, '957': {'count': 50}, '958': {'count': 50}, '959': {'count': 50}, '960': {'count': 50}, '961': {'count': 50}, '962': {'count': 50}, '963': {'count': 50}, '964': {'count': 50}, '965': {'count': 50}, '966': {'count': 50}, '967': {'count': 50}, '968': {'count': 50}, '969': {'count': 50}, '970': {'count': 50}, '971': {'count': 50}, '972': {'count': 50}, '973': {'count': 50}, '974': {'count': 50}, '975': {'count': 50}, '976': {'count': 50}, '977': {'count': 50}, '978': {'count': 50}, '979': {'count': 50}, '980': {'count': 50}, '981': {'count': 50}, '982': {'count': 50}, '983': {'count': 50}, '984': {'count': 50}, '985': {'count': 50}, '986': {'count': 50}, '987': {'count': 50}, '988': {'count': 50}, '989': {'count': 50}, '990': {'count': 50}, '991': {'count': 50}, '992': {'count': 50}, '993': {'count': 50}, '994': {'count': 50}, '995': {'count': 50}, '996': {'count': 50}, '997': {'count': 50}, '998': {'count': 50}, '999': {'count': 50}}}} | -| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) (Maas et al., 2011) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | -| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | +| [ImdbClassification.v2](http://www.aclweb.org/anthology/P11-1015) (Maas et al., 2011) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | +| [ImdbVNClassification](http://www.aclweb.org/anthology/P11-1015) (Loc Pham, 2025) | ['vie'] | Classification | p2p | [Reviews, Written] | None | None | +| [InappropriatenessClassification.v2](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | | [InappropriatenessClassificationv2](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | Classification | t2t | [Social, Web, Written] | None | None | | [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [Government, News, Non-fiction, Spoken, Spoken, Web, Written] | None | None | | [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [News, Web, Written] | {'validation': 57826, 'test': 58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': {'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': {'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 'max_sentence1_length': 355, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': {'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': {'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 'average_sentence1_length': 130.92, 'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 'max_sentence1_length': 367, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hne': {'num_samples': 1012, 'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 'number_of_characters': 268768, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 1012}}}} | @@ -368,17 +404,17 @@ The following tables give you an overview of the tasks in MTEB. | [IndicQARetrieval](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel'] | Retrieval | s2p | [Web, Written] | None | None | | [IndicReviewsClusteringP2P](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Clustering | p2p | [Reviews, Written] | None | None | | [IndicSentimentClassification](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Classification | s2s | [Reviews, Written] | None | None | -| [IndonesianIdClickbaitClassification](http://www.sciencedirect.com/science/article/pii/S2352340920311252) (Andika William, 2020) | ['ind'] | Classification | s2s | [News, Written] | None | None | -| [IndonesianMongabayConservationClassification](https://aclanthology.org/2023.sealp-1.4/) (Fransiska et al., 2023) | ['ind'] | Classification | s2s | [Web, Written] | None | None | +| [IndonesianIdClickbaitClassification.v2](http://www.sciencedirect.com/science/article/pii/S2352340920311252) (Andika William, 2020) | ['ind'] | Classification | s2s | [News, Written] | None | None | +| [IndonesianMongabayConservationClassification.v2](https://aclanthology.org/2023.sealp-1.4/) (Fransiska et al., 2023) | ['ind'] | Classification | s2s | [Web, Written] | None | None | | [InfoSeekIT2ITRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 499375} | {'test': {'number_of_characters': 291755841, 'num_samples': 499375, 'num_queries': 17593, 'num_documents': 481782, 'min_document_length': 8, 'average_document_length': 603.95, 'max_document_length': 4062, 'unique_documents': 481782, 'num_document_images': 481782, 'min_query_length': 21, 'average_query_length': 44.41, 'max_query_length': 87, 'unique_queries': 350, 'num_query_images': 17593, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 7.47, 'max_relevant_docs_per_query': 128, 'unique_relevant_docs': 7891}} | | [InfoSeekIT2TRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 622974} | {'test': {'number_of_characters': 360966739, 'num_samples': 622974, 'num_queries': 11323, 'num_documents': 611651, 'min_document_length': 9, 'average_document_length': 589.22, 'max_document_length': 7650, 'unique_documents': 611651, 'num_document_images': 0, 'min_query_length': 23, 'average_query_length': 50.18, 'max_query_length': 270, 'unique_queries': 269, 'num_query_images': 11323, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 6.52, 'max_relevant_docs_per_query': 66, 'unique_relevant_docs': 7799}} | | [InsurancePolicyInterpretationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [InternationalCitizenshipQuestionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [IsiZuluNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None | +| [IsiZuluNewsClassification.v2](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None | | [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Government, Legal, Written] | None | None | -| [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) (Trotta et al., 2021) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | -| [JCrewBlockerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [JDReview](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | +| [Itacola.v2](https://aclanthology.org/2021.findings-emnlp.250/) (Trotta et al., 2021) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | +| [JCrewBlockerLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [JDReview.v2](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [JQaRAReranking](https://huggingface.co/datasets/hotchpotch/JQaRA) | ['jpn'] | Reranking | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | | [JSICK](https://github.com/sbintuitions/JMTEB) (Yanaka et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | | [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) (Kurihara et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | @@ -386,22 +422,66 @@ The following tables give you an overview of the tasks in MTEB. | [JaCWIRRetrieval](https://huggingface.co/datasets/hotchpotch/JaCWIR) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaGovFaqsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [JapaneseSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) (Mollanorozy et al., 2023) | ['jpn'] | Classification | s2s | [Reviews, Written] | None | None | | [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) (鈴木正敏, 2020) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | -| [JavaneseIMDBClassification](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | None | None | +| [JavaneseIMDBClassification.v2](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | None | None | +| [JinaVDRAirbnbSyntheticRetrieval](https://huggingface.co/datasets/jinaai/airbnb-synthetic-retrieval_beir) (Michael Günther, 2025) | ['ara', 'deu', 'eng', 'fra', 'hin', 'hun', 'jpn', 'rus', 'spa', 'zho'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRArabicChartQARetrieval](https://huggingface.co/datasets/jinaai/arabic_chartqa_ar_beir) (Michael Günther, 2025) | ['ara'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRArabicInfographicsVQARetrieval](https://huggingface.co/datasets/jinaai/arabic_infographicsvqa_ar_beir) (Michael Günther, 2025) | ['ara'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRArxivQARetrieval](https://huggingface.co/datasets/jinaai/arxivqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRAutomobileCatelogRetrieval](https://huggingface.co/datasets/jinaai/automobile_catalogue_jp_beir) (Michael Günther, 2025) | ['jpn'] | DocumentUnderstanding | t2i | [Engineering, Web] | None | None | +| [JinaVDRBeveragesCatalogueRetrieval](https://huggingface.co/datasets/jinaai/beverages_catalogue_ru_beir) (Michael Günther, 2025) | ['rus'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRCharXivOCRRetrieval](https://huggingface.co/datasets/jinaai/CharXiv-en_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRChartQARetrieval](https://huggingface.co/datasets/jinaai/ChartQA_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRDocQAAI](https://huggingface.co/datasets/jinaai/docqa_artificial_intelligence_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRDocQAEnergyRetrieval](https://huggingface.co/datasets/jinaai/docqa_energy_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRDocQAGovReportRetrieval](https://huggingface.co/datasets/jinaai/docqa_gov_report_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Government] | None | None | +| [JinaVDRDocQAHealthcareIndustryRetrieval](https://huggingface.co/datasets/jinaai/docqa_healthcare_industry_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Medical] | None | None | +| [JinaVDRDocVQARetrieval](https://huggingface.co/datasets/jinaai/docvqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRDonutVQAISynHMPRetrieval](https://huggingface.co/datasets/jinaai/donut_vqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Medical] | None | None | +| [JinaVDREuropeanaDeNewsRetrieval](https://huggingface.co/datasets/jinaai/europeana-de-news_beir) (Michael Günther, 2025) | ['deu'] | DocumentUnderstanding | t2i | [News] | None | None | +| [JinaVDREuropeanaEsNewsRetrieval](https://huggingface.co/datasets/jinaai/europeana-es-news_beir) (Michael Günther, 2025) | ['spa'] | DocumentUnderstanding | t2i | [News] | None | None | +| [JinaVDREuropeanaFrNewsRetrieval](https://huggingface.co/datasets/jinaai/europeana-fr-news_beir) (Michael Günther, 2025) | ['fra'] | DocumentUnderstanding | t2i | [News] | None | None | +| [JinaVDREuropeanaItScansRetrieval](https://huggingface.co/datasets/jinaai/europeana-it-scans_beir) (Michael Günther, 2025) | ['ita'] | DocumentUnderstanding | t2i | [News] | None | None | +| [JinaVDREuropeanaNlLegalRetrieval](https://huggingface.co/datasets/jinaai/europeana-nl-legal_beir) (Michael Günther, 2025) | ['nld'] | DocumentUnderstanding | t2i | [Legal] | None | None | +| [JinaVDRGitHubReadmeRetrieval](https://huggingface.co/datasets/jinaai/github-readme-retrieval-multilingual_beir) (Michael Günther, 2025) | ['ara', 'ben', 'deu', 'eng', 'fra', 'hin', 'ind', 'ita', 'jpn', 'kor', 'nld', 'por', 'rus', 'spa', 'tha', 'vie', 'zho'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRHindiGovVQARetrieval](https://huggingface.co/datasets/jinaai/hindi-gov-vqa_beir) (Michael Günther, 2025) | ['hin'] | DocumentUnderstanding | t2i | [Government] | None | None | +| [JinaVDRHungarianDocQARetrieval](https://huggingface.co/datasets/jinaai/hungarian_doc_qa_beir) (Michael Günther, 2025) | ['hun'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRInfovqaRetrieval](https://huggingface.co/datasets/jinaai/infovqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRJDocQARetrieval](https://huggingface.co/datasets/jinaai/jdocqa_beir) (Michael Günther, 2025) | ['jpn'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRJina2024YearlyBookRetrieval](https://huggingface.co/datasets/jinaai/jina_2024_yearly_book_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRMMTabRetrieval](https://huggingface.co/datasets/jinaai/MMTab_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRMPMQARetrieval](https://huggingface.co/datasets/jinaai/mpmqa_small_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRMedicalPrescriptionsRetrieval](https://huggingface.co/datasets/jinaai/medical-prescriptions_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Medical] | None | None | +| [JinaVDROWIDChartsRetrieval](https://huggingface.co/datasets/jinaai/owid_charts_en_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDROpenAINewsRetrieval](https://huggingface.co/datasets/jinaai/openai-news_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [News, Web] | None | None | +| [JinaVDRPlotQARetrieval](https://huggingface.co/datasets/jinaai/plotqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRRamensBenchmarkRetrieval](https://huggingface.co/datasets/jinaai/ramen_benchmark_jp_beir) (Michael Günther, 2025) | ['jpn'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRShanghaiMasterPlanRetrieval](https://huggingface.co/datasets/jinaai/shanghai_master_plan_beir) (Michael Günther, 2025) | ['zho'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRShiftProjectRetrieval](https://huggingface.co/datasets/jinaai/shiftproject_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRStanfordSlideRetrieval](https://huggingface.co/datasets/jinaai/stanford_slide_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRStudentEnrollmentSyntheticRetrieval](https://huggingface.co/datasets/jinaai/student-enrollment_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRTQARetrieval](https://huggingface.co/datasets/jinaai/tqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRTabFQuadRetrieval](https://huggingface.co/datasets/jinaai/tabfquad_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRTableVQARetrieval](https://huggingface.co/datasets/jinaai/table-vqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | None | None | +| [JinaVDRTatQARetrieval](https://huggingface.co/datasets/jinaai/tatqa_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRTweetStockSyntheticsRetrieval](https://huggingface.co/datasets/jinaai/tweet-stock-synthetic-retrieval_beir) (Michael Günther, 2025) | ['ara', 'deu', 'eng', 'fra', 'hin', 'hun', 'jpn', 'rus', 'spa', 'zho'] | DocumentUnderstanding | t2i | [Social] | None | None | +| [JinaVDRWikimediaCommonsDocumentsRetrieval](https://huggingface.co/datasets/jinaai/wikimedia-commons-documents-ml_beir) (Michael Günther, 2025) | ['ara', 'ben', 'deu', 'eng', 'fra', 'hin', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mya', 'nld', 'por', 'rus', 'spa', 'tha', 'urd', 'vie', 'zho'] | DocumentUnderstanding | t2i | [Web] | None | None | +| [JinaVDRWikimediaCommonsMapsRetrieval](https://huggingface.co/datasets/jinaai/wikimedia-commons-maps_beir) (Michael Günther, 2025) | ['eng'] | DocumentUnderstanding | t2i | [Web] | None | None | | [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [Encyclopaedic, News, Written] | None | None | | [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [News, Reviews, Spoken, Spoken, Written] | None | None | -| [KLUE-TC](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | None | None | -| [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | +| [KLUE-TC.v2](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | None | None | +| [KannadaNewsClassification.v2](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | | [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [KlueMrcDomainClustering](https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain) (Sungjoon Park, 2021) | ['kor'] | Clustering | p2p | [News, Written] | None | None | | [KlueYnatMrcCategoryClustering](https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title) (Sungjoon Park, 2021) | ['kor'] | Clustering | s2s | [News, Written] | None | None | | Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | | [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [Financial, News, Written] | None | None | -| [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | +| [KorHateClassification.v2](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | | [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) (Lee et al., 2022) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | None | None | -| [KorSarcasmClassification](https://github.com/SpellOnYou/korean-sarcasm) (Kim et al., 2019) | ['kor'] | Classification | s2s | [Social, Written] | None | None | -| [KurdishSentimentClassification](https://link.springer.com/article/10.1007/s10579-023-09716-6) (Badawi et al., 2024) | ['kur'] | Classification | s2s | [Web, Written] | None | None | +| [KorSarcasmClassification.v2](https://github.com/SpellOnYou/korean-sarcasm) (Kim et al., 2019) | ['kor'] | Classification | s2s | [Social, Written] | None | None | +| [KurdishSentimentClassification.v2](https://link.springer.com/article/10.1007/s10579-023-09716-6) (Badawi et al., 2024) | ['kur'] | Classification | s2s | [Web, Written] | None | None | | [LCQMC](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [LEMBNarrativeQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ko{\v{c, 2018) | ['eng'] | Retrieval | s2p | [Fiction, Non-fiction, Written] | None | None | | [LEMBNeedleRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhu et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Blog, Written] | None | None | @@ -433,12 +513,13 @@ The following tables give you an overview of the tasks in MTEB. | [LegalBenchCorporateLobbying](https://huggingface.co/datasets/nguha/legalbench/viewer/corporate_lobbying) (Neel Guha, 2023) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | | [LegalBenchPC](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | PairClassification | s2s | [Legal, Written] | None | None | | [LegalQuAD](https://github.com/Christoph911/AIKE2021_Appendix) (Hoppe et al., 2021) | ['deu'] | Retrieval | s2p | [Legal, Written] | None | None | -| [LegalReasoningCausalityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [LegalReasoningCausalityLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [LegalSummarization](https://github.com/lauramanor/legal_summarization) (Manor et al., 2019) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | | [LinceMTBitextMining](https://ritual.uh.edu/lince/) (Aguilar et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | None | None | | [LitSearchRetrieval](https://github.com/princeton-nlp/LitSearch) (Ajith et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [LivedoorNewsClustering.v2](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | -| [MAUDLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [MAUDLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [MBPPRetrieval](https://huggingface.co/datasets/embedding-benchmark/MBPP) (Austin et al., 2021) | ['eng', 'python'] | Retrieval | s2s | [Programming] | {'test': 1948} | {'test': {'number_of_characters': 78520, 'num_samples': 1948, 'num_queries': 974, 'num_documents': 974, 'min_document_length': 37, 'average_document_length': 78.62, 'max_document_length': 249, 'unique_documents': 974, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 974, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 974}} | | [METI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 348597} | {'test': {'number_of_characters': 0, 'num_samples': 348597, 'num_queries': 87942, 'num_documents': 260655, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 260655, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 87942, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.96, 'max_relevant_docs_per_query': 9, 'unique_relevant_docs': 172713}} | | [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -459,19 +540,24 @@ The following tables give you an overview of the tasks in MTEB. | [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | +| [MSMARCO-VN](https://microsoft.github.io/msmarco/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) (Li et al., 2021) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | +| [MTOPDomainVNClassification](https://arxiv.org/pdf/2008.09335.pdf) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) (Li et al., 2021) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | -| [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) (Jovanoski et al., 2015) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | -| [MalayalamNewsClassification](https://github.com/goru001/nlp-for-malyalam) (Anoop Kunchukuttan, 2020) | ['mal'] | Classification | s2s | [News, Written] | None | None | +| [MTOPIntentVNClassification](https://arxiv.org/pdf/2008.09335.pdf) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Spoken, Spoken] | None | None | +| [MacedonianTweetSentimentClassification.v2](https://aclanthology.org/R15-1034/) (Jovanoski et al., 2015) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | +| [MalayalamNewsClassification.v2](https://github.com/goru001/nlp-for-malyalam) (Anoop Kunchukuttan, 2020) | ['mal'] | Classification | s2s | [News, Written] | None | None | | [MalteseNewsClassification](https://huggingface.co/datasets/MLRS/maltese_news_categories) (Chaudhary et al., 2024) | ['mlt'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | -| [MarathiNewsClassification](https://github.com/goru001/nlp-for-marathi) (Anoop Kunchukuttan, 2020) | ['mar'] | Classification | s2s | [News, Written] | None | None | +| [MarathiNewsClassification.v2](https://github.com/goru001/nlp-for-marathi) (Anoop Kunchukuttan, 2020) | ['mar'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [MasakhaNEWSClusteringS2S](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | s2s | [News, Written] | None | None | | [MassiveIntentClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | +| [MassiveIntentVNClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Spoken] | None | None | | [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | +| [MassiveScenarioVNClassification](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Spoken] | None | None | | [MedicalQARetrieval](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4) (Asma et al., 2019) | ['eng'] | Retrieval | s2s | [Medical, Written] | None | None | | [MedicalRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [MedrxivClusteringP2P.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 74294927, 'min_text_length': 148, 'average_text_length': 1981.2, 'max_text_length': 38759, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} | @@ -481,19 +567,20 @@ The following tables give you an overview of the tasks in MTEB. | [MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) (Nishikawa et al., 2022) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | | [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) (Wu et al., 2020) | ['eng'] | Reranking | s2s | [News, Written] | None | None | | MintakaRetrieval (Sen et al., 2022) | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [Moroco](https://huggingface.co/datasets/moroco) (Andrei M. Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | None | None | -| [MovieReviewSentimentClassification](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Théophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | +| [Moroco.v2](https://huggingface.co/datasets/moroco) (Andrei M. Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | None | None | +| [MovieReviewSentimentClassification.v2](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Théophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [MrTidyRetrieval](https://huggingface.co/datasets/castorini/mr-tydi) (Xinyu Zhang, 2021) | ['ara', 'ben', 'eng', 'fin', 'ind', 'jpn', 'kor', 'rus', 'swa', 'tel', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Government, Legal, Written] | None | None | | [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) (R{\"o, 2021) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | | [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Fiction, Non-fiction, Web, Written] | None | None | -| [MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | +| [MultilingualSentiment.v2](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | | [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) (Mollanorozy et al., 2023) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | -| [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | +| [MyanmarNews.v2](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | | [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | | [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [NFCorpus-NL](https://huggingface.co/datasets/clips/beir-nl-nfcorpus) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NFCorpus-VN](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NIGHTSI2IRetrieval](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9f09f316a3eaf59d9ced5ffaefe97e0f-Abstract-Conference.html) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 42158} | {'test': {'number_of_characters': 0, 'num_samples': 42158, 'num_queries': 2120, 'num_documents': 40038, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 40038, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 2120, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2120}} | | [NLPJournalAbsArticleRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalAbsArticleRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | @@ -503,13 +590,14 @@ The following tables give you an overview of the tasks in MTEB. | [NLPJournalTitleAbsRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleIntroRetrieval](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleIntroRetrieval.V2](https://huggingface.co/datasets/sbintuitions/JMTEB) (Li et al., 2024) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | -| [NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | +| [NLPTwitterAnalysisClassification.v2](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | | [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | | [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [NQ-Fa](https://huggingface.co/datasets/MCINext/nq-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NQ-NL](https://huggingface.co/datasets/clips/beir-nl-nq) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NQ-VN](https://ai.google.com/research/NaturalQuestions/) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | | [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) (Federmann et al., 2022) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': {'num_samples': 3826252, 'number_of_characters': 988355274, 'unique_pairs': 3820263, 'min_sentence1_length': 1, 'average_sentence1_length': 129.15, 'max_sentence1_length': 773, 'unique_sentence1': 241259, 'min_sentence2_length': 1, 'average_sentence2_length': 129.15, 'max_sentence2_length': 773, 'unique_sentence2': 241259, 'hf_subset_descriptive_stats': {'afr_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'afr_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'afr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'afr_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'afr_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'afr_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'afr_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'afr_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'afr_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'afr_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'amh_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'amh_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'amh_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'amh_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'amh_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'amh_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'amh_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'amh_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'amh_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'amh_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'amh_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'amh_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'amh_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'amh_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'arb_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'arb_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'arb_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'arb_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'arb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'arb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'arb_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'arb_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'arb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'arb_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'arb_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'arb_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'arb_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'arb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'arb_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'arb_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'arb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'arb_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'arb_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'arb_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'arb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'arb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'arb_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'arb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'arb_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'arb_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'arb_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'arb_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'arb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'arb_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'arb_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'arb_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'arb_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'aze_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'aze_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'aze_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'aze_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'aze_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'aze_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'aze_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'aze_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'aze_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bak_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'bak_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bak_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'bak_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'bak_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'bak_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'bak_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'bak_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'bak_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bel_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bel_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bel_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bel_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bel_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bel_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bel_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bel_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bel_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bel_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bel_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bel_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bel_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bem_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bem_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'bem_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'bem_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'bem_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'bem_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'bem_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'bem_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'ben_Beng-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ben_Beng-deu_Latn': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ben_Beng-div_Thaa': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'ben_Beng-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ben_Beng-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ben_Beng-eus_Latn': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'ben_Beng-fas_Arab': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ben_Beng-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ben_Beng-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ben_Beng-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'ben_Beng-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ben_Beng-hin_Deva': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ben_Beng-hun_Latn': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ben_Beng-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ben_Beng-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ben_Beng-kan_Knda': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'ben_Beng-kor_Hang': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ben_Beng-lit_Latn': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ben_Beng-mar_Deva': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'ben_Beng-nep_Deva': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ben_Beng-pan_Guru': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'ben_Beng-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ben_Beng-por_Latn': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ben_Beng-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ben_Beng-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ben_Beng-snd_Arab': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'ben_Beng-spa_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ben_Beng-swa_Latn': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ben_Beng-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ben_Beng-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ben_Beng-tel_Telu': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-tur_Latn': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ben_Beng-urd_Arab': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ben_Beng-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ben_Beng-zho_Hant': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ben_Beng-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'bod_Tibt-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'bod_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bod_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'bod_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'bod_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'bod_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'bod_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'bos_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bos_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bos_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bos_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bos_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bos_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bos_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bos_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bos_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bos_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bos_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bos_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bos_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bul_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bul_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bul_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bul_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bul_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bul_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bul_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bul_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bul_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bul_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bul_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bul_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bul_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'cat_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cat_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'cat_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'cat_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'cat_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'cat_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'cat_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'cat_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ces_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ces_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ces_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ces_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ces_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ces_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ces_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ces_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ces_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ces_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ces_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ces_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'ces_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ckb_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ckb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ckb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ckb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ckb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'ckb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'ckb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'ckb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'ckb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'ckb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'cym_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cym_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'dan_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'dan_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'dan_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dan_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'dan_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'dan_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'dan_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'dan_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'dan_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'dan_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'deu_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'deu_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'deu_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'deu_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'deu_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'deu_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'deu_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'deu_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'deu_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'deu_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'deu_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'deu_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'deu_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'deu_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'deu_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'deu_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'deu_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'deu_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'deu_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'deu_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'deu_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'deu_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'deu_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'deu_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'deu_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'deu_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'deu_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'deu_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'deu_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'div_Thaa-ben_Beng': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'div_Thaa-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'div_Thaa-eus_Latn': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'div_Thaa-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'div_Thaa-hin_Deva': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'div_Thaa-kan_Knda': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'div_Thaa-mar_Deva': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'div_Thaa-nep_Deva': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-pan_Guru': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'div_Thaa-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'div_Thaa-snd_Arab': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'div_Thaa-tam_Taml': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'div_Thaa-tel_Telu': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-urd_Arab': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'dzo_Tibt-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'dzo_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dzo_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'dzo_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'dzo_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'dzo_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'dzo_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'ell_Grek-arb_Arab': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ell_Grek-ben_Beng': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ell_Grek-deu_Latn': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ell_Grek-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ell_Grek-fas_Arab': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ell_Grek-fin_Latn': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ell_Grek-fra_Latn': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ell_Grek-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ell_Grek-hin_Deva': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ell_Grek-hun_Latn': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ell_Grek-hye_Armn': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ell_Grek-ind_Latn': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ell_Grek-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ell_Grek-kat_Geor': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'ell_Grek-kor_Hang': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ell_Grek-lit_Latn': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ell_Grek-nld_Latn': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ell_Grek-pol_Latn': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ell_Grek-por_Latn': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ell_Grek-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ell_Grek-spa_Latn': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ell_Grek-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ell_Grek-swa_Latn': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ell_Grek-swe_Latn': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ell_Grek-tam_Taml': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ell_Grek-tur_Latn': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ell_Grek-vie_Latn': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ell_Grek-zho_Hant': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ell_Grek-zul_Latn': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eng_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'eng_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'eng_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'eng_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'eng_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'eng_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'eng_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eng_Latn-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'eng_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'eng_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'eng_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'eng_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'eng_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'eng_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'eng_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'eng_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'eng_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eng_Latn-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'eng_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'eng_Latn-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'eng_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'eng_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'eng_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'eng_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'eng_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'eng_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'eng_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'eng_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'eng_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eng_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'eng_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'eng_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eng_Latn-hmn_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 165.64, 'max_sentence2_length': 643, 'unique_sentence2': 1997}, 'eng_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'eng_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'eng_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'eng_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'eng_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'eng_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eng_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'eng_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'eng_Latn-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'eng_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'eng_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'eng_Latn-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'eng_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'eng_Latn-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'eng_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'eng_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'eng_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'eng_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'eng_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eng_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'eng_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'eng_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'eng_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-mon_Mong': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'eng_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'eng_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'eng_Latn-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'eng_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'eng_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'eng_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'eng_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'eng_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'eng_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'eng_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eng_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'eng_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'eng_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'eng_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'eng_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'eng_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'eng_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'eng_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'eng_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'eng_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eng_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'eng_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'eng_Latn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'eng_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'eng_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'eng_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'eng_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'eng_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'eng_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'eng_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eng_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'eng_Latn-tha_Thai': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'eng_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'eng_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'eng_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'eng_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'eng_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'eng_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'eng_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'eng_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'eng_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'eng_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'eng_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'eng_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'eng_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'eng_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'eng_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'eng_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eus_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eus_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eus_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'eus_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eus_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eus_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eus_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eus_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eus_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eus_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eus_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eus_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ewe_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ewe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ewe_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ewe_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ewe_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ewe_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ewe_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'ewe_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'fao_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fao_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'fao_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fao_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fao_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'fao_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'fao_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fao_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'fao_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'fao_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fas_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fas_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'fas_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fas_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fas_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fas_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fas_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fas_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fas_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fas_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fas_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fas_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fas_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'fas_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fas_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fas_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'fas_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fas_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fas_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fas_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'fas_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'fas_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fas_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'fas_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fas_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fas_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fas_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'fas_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fas_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fas_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fas_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fij_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fij_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'fij_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fij_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fij_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fij_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fij_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fij_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fij_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fij_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fil_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fil_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'fil_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fil_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fil_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fil_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fil_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fil_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fil_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fil_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fin_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fin_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fin_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fin_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fin_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fin_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fin_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fin_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fin_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fin_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fin_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fin_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fin_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'fin_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fin_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fin_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fin_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fin_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fin_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fin_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fin_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fin_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fin_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fin_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fin_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fin_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fra_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fra_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fra_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'fra_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fra_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fra_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fra_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fra_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fra_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fra_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fra_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fra_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fra_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'fra_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fra_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fra_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fra_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'fra_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fra_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fra_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'fra_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fra_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fra_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fra_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fra_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fra_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fra_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fra_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fra_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fuc_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'fuc_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fuc_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'fuc_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'fuc_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'fuc_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'fuc_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'fuc_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'gle_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'gle_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'glg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'glg_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'glg_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'glg_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'glg_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'glg_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'guj_Gujr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'guj_Gujr-div_Thaa': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'guj_Gujr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'guj_Gujr-eus_Latn': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'guj_Gujr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'guj_Gujr-kan_Knda': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'guj_Gujr-mar_Deva': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'guj_Gujr-nep_Deva': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-pan_Guru': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'guj_Gujr-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'guj_Gujr-snd_Arab': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'guj_Gujr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'guj_Gujr-tel_Telu': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-urd_Arab': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hau_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'hau_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hau_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'hau_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'hau_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'hau_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'hau_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'hau_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hau_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'hau_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'hau_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'hau_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'hau_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'hau_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'heb_Hebr-arb_Arab': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'heb_Hebr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'heb_Hebr-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'heb_Hebr-deu_Latn': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'heb_Hebr-ell_Grek': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'heb_Hebr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'heb_Hebr-fas_Arab': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'heb_Hebr-fin_Latn': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'heb_Hebr-fra_Latn': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'heb_Hebr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'heb_Hebr-hun_Latn': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'heb_Hebr-ind_Latn': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'heb_Hebr-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'heb_Hebr-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'heb_Hebr-kor_Hang': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'heb_Hebr-lit_Latn': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'heb_Hebr-mey_Arab': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'heb_Hebr-nld_Latn': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'heb_Hebr-pol_Latn': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'heb_Hebr-por_Latn': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'heb_Hebr-prs_Arab': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'heb_Hebr-pus_Arab': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'heb_Hebr-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'heb_Hebr-shi_Arab': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'heb_Hebr-spa_Latn': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'heb_Hebr-swa_Latn': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'heb_Hebr-swe_Latn': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'heb_Hebr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'heb_Hebr-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'heb_Hebr-tur_Latn': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'heb_Hebr-vie_Latn': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'heb_Hebr-zho_Hant': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'heb_Hebr-zul_Latn': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hin_Deva-arb_Arab': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hin_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hin_Deva-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hin_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'hin_Deva-ell_Grek': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hin_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hin_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'hin_Deva-fas_Arab': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hin_Deva-fin_Latn': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hin_Deva-fra_Latn': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hin_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'hin_Deva-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hin_Deva-hun_Latn': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'hin_Deva-ind_Latn': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hin_Deva-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hin_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'hin_Deva-kor_Hang': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hin_Deva-lit_Latn': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hin_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'hin_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-nld_Latn': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hin_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'hin_Deva-pol_Latn': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hin_Deva-por_Latn': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hin_Deva-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hin_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'hin_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'hin_Deva-spa_Latn': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hin_Deva-swa_Latn': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hin_Deva-swe_Latn': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hin_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hin_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-tur_Latn': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hin_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hin_Deva-vie_Latn': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hin_Deva-zho_Hant': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hin_Deva-zul_Latn': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hmn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 165.64, 'max_sentence1_length': 643, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'hrv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'hrv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'hrv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'hrv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'hrv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hrv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hrv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'hrv_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hrv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'hrv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'hrv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'hun_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hun_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hun_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hun_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hun_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hun_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hun_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hun_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hun_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hun_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'hun_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hun_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hun_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hun_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'hun_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hun_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hun_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hun_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hun_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hun_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hun_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hun_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hun_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hun_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hun_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hun_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hun_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hye_Armn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hye_Armn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hye_Armn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'hye_Armn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ibo_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ibo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ibo_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ibo_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ibo_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ibo_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ibo_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'ibo_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ibo_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ibo_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ibo_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ibo_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ibo_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ibo_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ind_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ind_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ind_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ind_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ind_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ind_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ind_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ind_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ind_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ind_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ind_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ind_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ind_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ind_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ind_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ind_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ind_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ind_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ind_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ind_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ind_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ind_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ind_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ind_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ind_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ind_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ind_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ind_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ind_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'ind_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ind_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'ind_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ind_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ind_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ind_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'isl_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'isl_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'isl_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'isl_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'isl_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'isl_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'isl_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'isl_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'isl_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'isl_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ita_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ita_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ita_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ita_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ita_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ita_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ita_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'ita_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-arb_Arab': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'jpn_Jpan-ben_Beng': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'jpn_Jpan-deu_Latn': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'jpn_Jpan-ell_Grek': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'jpn_Jpan-eng_Latn': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'jpn_Jpan-fas_Arab': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'jpn_Jpan-fin_Latn': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'jpn_Jpan-fra_Latn': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'jpn_Jpan-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'jpn_Jpan-hin_Deva': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'jpn_Jpan-hun_Latn': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'jpn_Jpan-ind_Latn': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'jpn_Jpan-kor_Hang': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'jpn_Jpan-lit_Latn': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'jpn_Jpan-nld_Latn': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'jpn_Jpan-pol_Latn': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'jpn_Jpan-por_Latn': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'jpn_Jpan-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'jpn_Jpan-spa_Latn': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-swa_Latn': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'jpn_Jpan-swe_Latn': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'jpn_Jpan-tam_Taml': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'jpn_Jpan-tur_Latn': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'jpn_Jpan-vie_Latn': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'jpn_Jpan-yue_Hant': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'jpn_Jpan-zho_Hans': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'jpn_Jpan-zho_Hant': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'jpn_Jpan-zul_Latn': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'kan_Knda-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kan_Knda-div_Thaa': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'kan_Knda-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kan_Knda-eus_Latn': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'kan_Knda-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'kan_Knda-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kan_Knda-mar_Deva': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'kan_Knda-nep_Deva': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-pan_Guru': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'kan_Knda-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kan_Knda-snd_Arab': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'kan_Knda-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kan_Knda-tel_Telu': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-urd_Arab': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'kat_Geor-ell_Grek': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kat_Geor-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kat_Geor-hye_Armn': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kat_Geor-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'kaz_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kaz_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kaz_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kaz_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'kaz_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kaz_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kaz_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kaz_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kaz_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'khm_Khmr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'khm_Khmr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'khm_Khmr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'khm_Khmr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'khm_Khmr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'khm_Khmr-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'khm_Khmr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'kin_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'kin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kin_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'kin_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'kin_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'kin_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'kin_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'kin_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'kir_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kir_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kir_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kir_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'kir_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kir_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kir_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kir_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kir_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'kmr_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kmr_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'kmr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kmr_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kmr_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kmr_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'kmr_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'kmr_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'kmr_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'kmr_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'kor_Hang-arb_Arab': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kor_Hang-ben_Beng': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kor_Hang-deu_Latn': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'kor_Hang-ell_Grek': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kor_Hang-eng_Latn': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kor_Hang-fas_Arab': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kor_Hang-fin_Latn': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'kor_Hang-fra_Latn': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'kor_Hang-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kor_Hang-hin_Deva': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kor_Hang-hun_Latn': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'kor_Hang-ind_Latn': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'kor_Hang-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'kor_Hang-lit_Latn': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'kor_Hang-nld_Latn': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kor_Hang-pol_Latn': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'kor_Hang-por_Latn': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'kor_Hang-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'kor_Hang-spa_Latn': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'kor_Hang-swa_Latn': {'num_samples': 1997, 'number_of_characters': 405914, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'kor_Hang-swe_Latn': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'kor_Hang-tam_Taml': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kor_Hang-tur_Latn': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kor_Hang-vie_Latn': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'kor_Hang-yue_Hant': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'kor_Hang-zho_Hans': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'kor_Hang-zho_Hant': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'kor_Hang-zul_Latn': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'lao_Laoo-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'lao_Laoo-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'lao_Laoo-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lao_Laoo-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'lao_Laoo-mon_Mong': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'lao_Laoo-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'lao_Laoo-tha_Thai': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'lav_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lav_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lav_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lav_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'lit_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'lit_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'lit_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'lit_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'lit_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lit_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'lit_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lit_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'lit_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'lit_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'lit_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lit_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'lit_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'lit_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'lit_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'lit_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'lit_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'lit_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'lit_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'lit_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'lit_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'lit_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'lit_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'lit_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'lit_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'lit_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'lit_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ltz_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ltz_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'ltz_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ltz_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ltz_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'ltz_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'ltz_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ltz_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'ltz_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'ltz_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'mal_Mlym-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mal_Mlym-fij_Latn': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mal_Mlym-fil_Latn': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mal_Mlym-ind_Latn': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mal_Mlym-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mal_Mlym-mri_Latn': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mal_Mlym-msa_Latn': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mal_Mlym-smo_Latn': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mal_Mlym-tah_Latn': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mal_Mlym-ton_Latn': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mar_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'mar_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'mar_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mar_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'mar_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'mar_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'mar_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'mar_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'mar_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'mar_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'mar_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'mar_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'mey_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'mey_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'mey_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mey_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'mey_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'mey_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'mey_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'mey_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'mey_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'mey_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'mkd_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'mkd_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'mkd_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'mkd_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'mkd_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mkd_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'mkd_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mkd_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'mkd_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'mkd_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'mkd_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'mkd_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'mkd_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'mlg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlg_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mlg_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mlg_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mlg_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mlg_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mlg_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mlg_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mlg_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mlg_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mlt_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'mlt_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlt_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'mlt_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mlt_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'mlt_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'mlt_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'mlt_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'mon_Mong-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mon_Mong-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mon_Mong-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mon_Mong-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mon_Mong-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mon_Mong-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'mon_Mong-tha_Thai': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'mri_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mri_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mri_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mri_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mri_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mri_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mri_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mri_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mri_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mri_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'msa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'msa_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'msa_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'msa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'msa_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'msa_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'msa_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'msa_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'msa_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'msa_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mya_Mymr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mya_Mymr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mya_Mymr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mya_Mymr-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mya_Mymr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mya_Mymr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'mya_Mymr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'nde_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nde_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nde_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nde_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nde_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nde_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'nde_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nde_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'nep_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nep_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'nep_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nep_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'nep_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'nep_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nep_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'nep_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'nep_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'nep_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'nep_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'nep_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nep_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'nep_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'nld_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'nld_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nld_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nld_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nld_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'nld_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nld_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nld_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'nld_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'nld_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'nld_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'nld_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nld_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'nld_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'nld_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nld_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'nld_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'nld_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'nld_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nld_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nld_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nld_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'nld_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'nld_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'nld_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'nld_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nld_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nld_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nld_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'nld_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'nld_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nno_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nno_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nno_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nno_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nno_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nno_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nno_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nno_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nno_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nno_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nob_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nob_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nob_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nob_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nob_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nob_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nob_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nob_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nob_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nob_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nso_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'nso_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nso_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'nso_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'nso_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'nso_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'nso_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'nso_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nso_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'nso_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'nso_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'nso_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'nso_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'nso_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nya_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nya_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nya_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nya_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nya_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nya_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'nya_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nya_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'orm_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'orm_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'orm_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'orm_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'orm_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'orm_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'orm_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'orm_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'orm_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'orm_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'orm_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'orm_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'orm_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'orm_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'pan_Guru-ben_Beng': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pan_Guru-div_Thaa': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'pan_Guru-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pan_Guru-eus_Latn': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'pan_Guru-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'pan_Guru-hin_Deva': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pan_Guru-kan_Knda': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'pan_Guru-mar_Deva': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'pan_Guru-nep_Deva': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'pan_Guru-snd_Arab': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'pan_Guru-tam_Taml': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pan_Guru-tel_Telu': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-urd_Arab': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'pol_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pol_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'pol_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pol_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'pol_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'pol_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'pol_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'pol_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'pol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pol_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pol_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'pol_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pol_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pol_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'pol_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'pol_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'pol_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'pol_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'pol_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'pol_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'pol_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'pol_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'pol_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'pol_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'pol_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'pol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'pol_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'pol_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pol_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'pol_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'pol_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'pol_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'pol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'por_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'por_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'por_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'por_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'por_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'por_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'por_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'por_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'por_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'por_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'por_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'por_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'por_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'por_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'por_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'por_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'por_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'por_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'por_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'por_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'por_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'por_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'por_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'por_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'por_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'por_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'por_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'por_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'por_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'prs_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'prs_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'prs_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'prs_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'prs_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'prs_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'prs_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'prs_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'prs_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'prs_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'pus_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pus_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'pus_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pus_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pus_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pus_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'pus_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'pus_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'pus_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'pus_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'ron_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ron_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ron_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ron_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ron_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'ron_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ron_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ron_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'rus_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'rus_Cyrl-ben_Beng': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'rus_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'rus_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'rus_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'rus_Cyrl-deu_Latn': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'rus_Cyrl-ell_Grek': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'rus_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'rus_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'rus_Cyrl-fin_Latn': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-fra_Latn': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'rus_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'rus_Cyrl-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'rus_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-hun_Latn': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'rus_Cyrl-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'rus_Cyrl-kor_Hang': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'rus_Cyrl-lit_Latn': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'rus_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'rus_Cyrl-nld_Latn': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'rus_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'rus_Cyrl-por_Latn': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'rus_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'rus_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-spa_Latn': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'rus_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'rus_Cyrl-swa_Latn': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'rus_Cyrl-swe_Latn': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'rus_Cyrl-tam_Taml': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'rus_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'rus_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'rus_Cyrl-vie_Latn': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'rus_Cyrl-zho_Hant': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'rus_Cyrl-zul_Latn': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'shi_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'shi_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'shi_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'shi_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'shi_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'shi_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'shi_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'shi_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'shi_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'shi_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'sin_Sinh-ben_Beng': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'sin_Sinh-div_Thaa': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'sin_Sinh-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sin_Sinh-eus_Latn': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'sin_Sinh-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'sin_Sinh-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'sin_Sinh-kan_Knda': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'sin_Sinh-mar_Deva': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'sin_Sinh-nep_Deva': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-pan_Guru': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'sin_Sinh-snd_Arab': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'sin_Sinh-tam_Taml': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'sin_Sinh-tel_Telu': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-urd_Arab': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'slk_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slk_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slk_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slk_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slk_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slk_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slk_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slk_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slk_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'slk_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slk_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slk_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'slv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slv_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'slv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'smo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'smo_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'smo_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'smo_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'smo_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'smo_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'smo_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'smo_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'smo_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'smo_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'sna_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'sna_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sna_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'sna_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'sna_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'sna_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'sna_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'sna_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'snd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'snd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'snd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'snd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'snd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'snd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'snd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'snd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'snd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'snd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'snd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'snd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-urd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'som_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'som_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'som_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'som_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'som_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'som_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'som_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'som_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'som_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'som_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'som_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'som_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'som_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'som_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'spa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'spa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'spa_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'spa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'spa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'spa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'spa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'spa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'spa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'spa_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'spa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'spa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'spa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'spa_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'spa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'spa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'spa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'spa_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'spa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'spa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'spa_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'spa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'spa_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'spa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'spa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'spa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'spa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'spa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'spa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'sqi_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'sqi_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sqi_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'sqi_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'srp_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'srp_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'srp_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'srp_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ssw_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ssw_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ssw_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ssw_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'ssw_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ssw_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ssw_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ssw_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ssw_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ssw_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ssw_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ssw_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ssw_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ssw_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swa_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'swa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swa_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'swa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swa_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'swa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 405914, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swa_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'swa_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'swa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swa_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'swa_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swa_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'swa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'swa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swa_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'swa_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'swa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swa_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'swa_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'swa_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'swa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swe_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swe_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swe_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'swe_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swe_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swe_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'swe_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swe_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swe_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swe_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swe_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swe_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swe_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swe_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'swe_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swe_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swe_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swe_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'swe_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swe_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'swe_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'swe_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swe_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swe_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swe_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swe_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'swe_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swe_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swe_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swe_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tah_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tah_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'tah_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'tah_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tah_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'tah_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'tah_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'tah_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'tah_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'tah_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'tam_Taml-arb_Arab': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tam_Taml-ben_Beng': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tam_Taml-deu_Latn': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tam_Taml-div_Thaa': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tam_Taml-ell_Grek': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tam_Taml-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tam_Taml-eus_Latn': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tam_Taml-fas_Arab': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tam_Taml-fin_Latn': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tam_Taml-fra_Latn': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tam_Taml-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tam_Taml-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tam_Taml-hin_Deva': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tam_Taml-hun_Latn': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tam_Taml-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tam_Taml-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tam_Taml-kan_Knda': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tam_Taml-kor_Hang': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tam_Taml-lit_Latn': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tam_Taml-mar_Deva': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tam_Taml-nep_Deva': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-nld_Latn': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tam_Taml-pan_Guru': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tam_Taml-pol_Latn': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tam_Taml-por_Latn': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tam_Taml-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tam_Taml-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tam_Taml-snd_Arab': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tam_Taml-spa_Latn': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tam_Taml-swa_Latn': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tam_Taml-swe_Latn': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tam_Taml-tel_Telu': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-tur_Latn': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tam_Taml-urd_Arab': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tam_Taml-vie_Latn': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tam_Taml-zho_Hant': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tam_Taml-zul_Latn': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tat_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tat_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tat_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tat_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tat_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tat_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tat_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tat_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tat_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tel_Telu-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tel_Telu-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tel_Telu-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tel_Telu-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tel_Telu-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tel_Telu-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tel_Telu-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tel_Telu-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tel_Telu-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tel_Telu-pan_Guru': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tel_Telu-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tel_Telu-snd_Arab': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tel_Telu-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tel_Telu-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tgk_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tgk_Cyrl-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'tgk_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tgk_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tgk_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tgk_Cyrl-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'tgk_Cyrl-mey_Arab': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'tgk_Cyrl-prs_Arab': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'tgk_Cyrl-pus_Arab': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'tgk_Cyrl-shi_Arab': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'tha_Thai-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'tha_Thai-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'tha_Thai-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tha_Thai-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'tha_Thai-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'tha_Thai-mon_Mong': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'tha_Thai-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'tir_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tir_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tir_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tir_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tir_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tir_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tir_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tir_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tir_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tir_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'tir_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tir_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tir_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tir_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ton_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ton_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ton_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ton_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ton_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ton_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ton_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ton_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ton_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ton_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'tsn_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tsn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tsn_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tsn_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tsn_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tsn_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tsn_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tsn_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tsn_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tsn_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'tsn_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tsn_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tsn_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tsn_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tuk_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tuk_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tuk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tuk_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tuk_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tuk_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tuk_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tuk_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tuk_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tur_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tur_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tur_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tur_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tur_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tur_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tur_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tur_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tur_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tur_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tur_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tur_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tur_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tur_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tur_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tur_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tur_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tur_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tur_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tur_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tur_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tur_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tur_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tur_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tur_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tur_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tur_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tur_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tur_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tur_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'uig_Arab-aze_Latn': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uig_Arab-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uig_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uig_Arab-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uig_Arab-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uig_Arab-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uig_Arab-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uig_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uig_Arab-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'ukr_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ukr_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ukr_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ukr_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'ukr_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ukr_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ukr_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ukr_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ukr_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ukr_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ukr_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ukr_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ukr_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'urd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'urd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'urd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'urd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'urd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'urd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'urd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'urd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'urd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'urd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'urd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'urd_Arab-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'urd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'urd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'uzb_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uzb_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uzb_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uzb_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uzb_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uzb_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uzb_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uzb_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uzb_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'ven_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ven_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ven_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'ven_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ven_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ven_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ven_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ven_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'vie_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'vie_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'vie_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'vie_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'vie_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'vie_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'vie_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'vie_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'vie_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'vie_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'vie_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'vie_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'vie_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'vie_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'vie_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'vie_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'vie_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'vie_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'vie_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'vie_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'vie_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'vie_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'vie_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'vie_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'vie_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'vie_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'vie_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'vie_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'wol_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'wol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'wol_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'wol_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'wol_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'wol_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'wol_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'wol_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'wol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'wol_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'wol_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'wol_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'wol_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'wol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'xho_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'xho_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'xho_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'xho_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'xho_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'xho_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'xho_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'xho_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'xho_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'xho_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'xho_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'xho_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'xho_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'xho_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yor_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'yor_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yor_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'yor_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'yor_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'yor_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'yor_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'yor_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'yor_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'yor_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'yor_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'yor_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'yor_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'yor_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yue_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yue_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'yue_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'yue_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'yue_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'yue_Hant-zho_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hans-eng_Latn': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hans-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hans-kor_Hang': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hans-vie_Latn': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hans-yue_Hant': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hans-zho_Hant': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hant-arb_Arab': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zho_Hant-ben_Beng': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zho_Hant-deu_Latn': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zho_Hant-ell_Grek': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zho_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hant-fas_Arab': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zho_Hant-fin_Latn': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zho_Hant-fra_Latn': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zho_Hant-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zho_Hant-hin_Deva': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zho_Hant-hun_Latn': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zho_Hant-ind_Latn': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zho_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hant-lit_Latn': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zho_Hant-nld_Latn': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zho_Hant-pol_Latn': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zho_Hant-por_Latn': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zho_Hant-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zho_Hant-spa_Latn': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zho_Hant-swa_Latn': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zho_Hant-swe_Latn': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zho_Hant-tam_Taml': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zho_Hant-tur_Latn': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zho_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hant-yue_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'zho_Hant-zul_Latn': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'zul_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'zul_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zul_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zul_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zul_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zul_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zul_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zul_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zul_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zul_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'zul_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zul_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zul_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zul_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'zul_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zul_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zul_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zul_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zul_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zul_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'zul_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'zul_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zul_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zul_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zul_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'zul_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zul_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'zul_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zul_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zul_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zul_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'zul_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'zul_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zul_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zul_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'zul_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'zul_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'zul_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}}}} | | [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -529,19 +617,19 @@ The following tables give you an overview of the tasks in MTEB. | [NanoSciFactRetrieval](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) (Potthast et al., 2022) | ['eng'] | Retrieval | s2p | [Academic] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | -| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) (Arora et al., 2020) | ['nep'] | Classification | s2s | [News, Written] | None | None | +| [NepaliNewsClassification.v2](https://github.com/goru001/nlp-for-nepali) (Arora et al., 2020) | ['nep'] | Classification | s2s | [News, Written] | None | None | | [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | -| [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | -| [NoRecClassification](https://aclanthology.org/L18-1661/) (Velldal et al., 2018) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | +| [NewsClassification.v2](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | +| [NoRecClassification.v2](https://aclanthology.org/L18-1661/) (Velldal et al., 2018) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | | [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Reviews, Social, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | | [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) (Ivanova et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) (Haas et al., 2021) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | | [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 228} | {'test': {'num_samples': 228, 'number_of_characters': 37441, 'unique_pairs': 228, 'min_sentence1_length': 13, 'average_sentence1_length': 82.2, 'max_sentence1_length': 272, 'unique_sentence1': 227, 'min_sentence2_length': 10, 'average_sentence2_length': 82.02, 'max_sentence2_length': 269, 'unique_sentence2': 226}} | -| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) (Kummervold et al., 2021) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | +| [NorwegianParliamentClassification.v2](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) (Kummervold et al., 2021) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | | [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) (Cahyawijaya et al., 2023) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | | [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) (Cahyawijaya et al., 2023) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | | [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'num_samples': 50200, 'number_of_characters': 14759870, 'unique_pairs': 50140, 'min_sentence1_length': 5, 'average_sentence1_length': 145.46, 'max_sentence1_length': 873, 'unique_sentence1': 8258, 'min_sentence2_length': 5, 'average_sentence2_length': 148.57, 'max_sentence2_length': 980, 'unique_sentence2': 50102, 'hf_subset_descriptive_stats': {'ind-abs': {'num_samples': 1000, 'number_of_characters': 295680, 'unique_pairs': 999, 'min_sentence1_length': 5, 'average_sentence1_length': 148.37, 'max_sentence1_length': 727, 'unique_sentence1': 998, 'min_sentence2_length': 6, 'average_sentence2_length': 147.31, 'max_sentence2_length': 629, 'unique_sentence2': 998}, 'ind-btk': {'num_samples': 6600, 'number_of_characters': 1927907, 'unique_pairs': 6597, 'min_sentence1_length': 5, 'average_sentence1_length': 145.37, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 146.74, 'max_sentence2_length': 980, 'unique_sentence2': 6596}, 'ind-bew': {'num_samples': 6600, 'number_of_characters': 1939300, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.41, 'max_sentence2_length': 840, 'unique_sentence2': 6590}, 'ind-bhp': {'num_samples': 1000, 'number_of_characters': 261666, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 133.53, 'max_sentence1_length': 468, 'unique_sentence1': 999, 'min_sentence2_length': 10, 'average_sentence2_length': 128.14, 'max_sentence2_length': 459, 'unique_sentence2': 999}, 'ind-jav': {'num_samples': 6600, 'number_of_characters': 1922162, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 145.81, 'max_sentence2_length': 854, 'unique_sentence2': 6585}, 'ind-mad': {'num_samples': 6600, 'number_of_characters': 1973257, 'unique_pairs': 6598, 'min_sentence1_length': 5, 'average_sentence1_length': 145.36, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 153.62, 'max_sentence2_length': 827, 'unique_sentence2': 6592}, 'ind-mak': {'num_samples': 6600, 'number_of_characters': 1953868, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 150.61, 'max_sentence2_length': 888, 'unique_sentence2': 6586}, 'ind-min': {'num_samples': 6600, 'number_of_characters': 1937033, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.06, 'max_sentence2_length': 837, 'unique_sentence2': 6591}, 'ind-mui': {'num_samples': 1000, 'number_of_characters': 301448, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 150.45, 'max_sentence1_length': 451, 'unique_sentence1': 997, 'min_sentence2_length': 11, 'average_sentence2_length': 150.99, 'max_sentence2_length': 450, 'unique_sentence2': 1000}, 'ind-rej': {'num_samples': 1000, 'number_of_characters': 291205, 'unique_pairs': 1000, 'min_sentence1_length': 9, 'average_sentence1_length': 151.62, 'max_sentence1_length': 873, 'unique_sentence1': 998, 'min_sentence2_length': 8, 'average_sentence2_length': 139.58, 'max_sentence2_length': 784, 'unique_sentence2': 1000}, 'ind-sun': {'num_samples': 6600, 'number_of_characters': 1956344, 'unique_pairs': 6591, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 150.99, 'max_sentence2_length': 881, 'unique_sentence2': 6588}}}} | @@ -549,27 +637,27 @@ The following tables give you an overview of the tasks in MTEB. | [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None | | [OKVQAIT2TRetrieval](https://okvqa.allenai.org/) (Marino et al., 2019) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 119562} | {'test': {'number_of_characters': 72551583, 'num_samples': 119562, 'num_queries': 5046, 'num_documents': 114516, 'min_document_length': 234, 'average_document_length': 631.71, 'max_document_length': 2112, 'unique_documents': 114516, 'num_document_images': 0, 'min_query_length': 11, 'average_query_length': 41.71, 'max_query_length': 155, 'unique_queries': 4624, 'num_query_images': 5046, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 7.43, 'max_relevant_docs_per_query': 9, 'unique_relevant_docs': 19941}} | | [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115DataSecurityLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115DoNotTrackLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115FirstPartyCollectionUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115InternationalAndSpecificAudiencesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115PolicyChangeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115UserChoiceControlLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OVENIT2ITRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 349876} | {'test': {'number_of_characters': 199201924, 'num_samples': 349876, 'num_queries': 14741, 'num_documents': 335135, 'min_document_length': 11, 'average_document_length': 593.1, 'max_document_length': 2531, 'unique_documents': 335135, 'num_document_images': 335135, 'min_query_length': 11, 'average_query_length': 29.36, 'max_query_length': 100, 'unique_queries': 520, 'num_query_images': 14741, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 17.72, 'max_relevant_docs_per_query': 128, 'unique_relevant_docs': 14284}} | | [OVENIT2TRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | {'test': 726671} | {'test': {'number_of_characters': 356548734, 'num_samples': 726671, 'num_queries': 50004, 'num_documents': 676667, 'min_document_length': 2, 'average_document_length': 524.5, 'max_document_length': 60546, 'unique_documents': 676667, 'num_document_images': 0, 'min_query_length': 11, 'average_query_length': 32.77, 'max_query_length': 78, 'unique_queries': 1540, 'num_query_images': 50004, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 9.85, 'max_relevant_docs_per_query': 174, 'unique_relevant_docs': 24471}} | | [Ocnli](https://arxiv.org/abs/2010.05444) (Hai Hu, 2020) | ['cmn'] | PairClassification | s2s | | None | None | -| [OdiaNewsClassification](https://github.com/goru001/nlp-for-odia) (Anoop Kunchukuttan, 2020) | ['ory'] | Classification | s2s | [News, Written] | None | None | +| [OdiaNewsClassification.v2](https://github.com/goru001/nlp-for-odia) (Anoop Kunchukuttan, 2020) | ['ory'] | Classification | s2s | [News, Written] | None | None | | [OnlineShopping](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [OnlineStoreReviewSentimentClassification.v2](https://huggingface.co/datasets/Ruqiya/Arabic_Reviews_of_SHEIN) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) (Mathias Creutz, 2018) | ['deu', 'eng', 'fin', 'fra', 'rus', 'swe'] | PairClassification | s2s | [Spoken, Spoken] | None | None | -| [OralArgumentQuestionPurposeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [OverrulingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OralArgumentQuestionPurposeLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OverrulingLegalBenchClassification.v2](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OxfordFlowersClassification](https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train) (Nilsback et al., 2008) | ['eng'] | ImageClassification | i2i | [Reviews] | {'test': 1020} | {'test': {'num_samples': 1020, 'unique_num_labels': 102, 'min_image_width': 500, 'average_image_width': 618.07, 'max_image_width': 873, 'min_image_height': 500, 'average_image_height': 538.26, 'max_image_height': 928, 'labels': {'0': {'count': 9}, '1': {'count': 9}, '2': {'count': 10}, '3': {'count': 9}, '4': {'count': 11}, '5': {'count': 11}, '6': {'count': 10}, '7': {'count': 10}, '8': {'count': 11}, '9': {'count': 10}, '10': {'count': 10}, '11': {'count': 9}, '12': {'count': 10}, '13': {'count': 10}, '14': {'count': 10}, '15': {'count': 9}, '16': {'count': 11}, '17': {'count': 11}, '18': {'count': 10}, '19': {'count': 9}, '20': {'count': 9}, '21': {'count': 10}, '22': {'count': 11}, '23': {'count': 11}, '24': {'count': 10}, '25': {'count': 11}, '26': {'count': 10}, '27': {'count': 9}, '28': {'count': 11}, '29': {'count': 10}, '30': {'count': 10}, '31': {'count': 9}, '32': {'count': 10}, '33': {'count': 10}, '34': {'count': 10}, '35': {'count': 11}, '36': {'count': 9}, '37': {'count': 10}, '38': {'count': 10}, '39': {'count': 11}, '40': {'count': 10}, '41': {'count': 10}, '42': {'count': 11}, '43': {'count': 10}, '44': {'count': 10}, '45': {'count': 10}, '46': {'count': 10}, '47': {'count': 9}, '48': {'count': 10}, '49': {'count': 11}, '50': {'count': 10}, '51': {'count': 10}, '52': {'count': 10}, '53': {'count': 10}, '54': {'count': 10}, '55': {'count': 10}, '56': {'count': 10}, '57': {'count': 11}, '58': {'count': 10}, '59': {'count': 10}, '60': {'count': 10}, '61': {'count': 10}, '62': {'count': 9}, '63': {'count': 10}, '64': {'count': 10}, '65': {'count': 9}, '66': {'count': 11}, '67': {'count': 10}, '68': {'count': 11}, '69': {'count': 9}, '70': {'count': 9}, '71': {'count': 10}, '72': {'count': 10}, '73': {'count': 10}, '74': {'count': 10}, '75': {'count': 10}, '76': {'count': 10}, '77': {'count': 10}, '78': {'count': 9}, '79': {'count': 10}, '80': {'count': 10}, '81': {'count': 11}, '82': {'count': 10}, '83': {'count': 9}, '84': {'count': 11}, '85': {'count': 10}, '86': {'count': 10}, '87': {'count': 10}, '88': {'count': 10}, '89': {'count': 10}, '90': {'count': 10}, '91': {'count': 10}, '92': {'count': 10}, '93': {'count': 9}, '94': {'count': 9}, '95': {'count': 10}, '96': {'count': 10}, '97': {'count': 10}, '98': {'count': 10}, '99': {'count': 10}, '100': {'count': 10}, '101': {'count': 11}}}} | | [OxfordPets](https://ieeexplore.ieee.org/abstract/document/6248092) (Parkhi et al., 2012) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 3669} | {'test': {'num_samples': 3669, 'unique_num_labels': 37, 'min_image_width': 137, 'average_image_width': 443.46, 'max_image_width': 1646, 'min_image_height': 103, 'average_image_height': 399.38, 'max_image_height': 2160, 'labels': {'0': {'count': 98}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 88}, '8': {'count': 99}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 97}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 99}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 99}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 89}, '35': {'count': 100}, '36': {'count': 100}}}} | | [OxfordPetsZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 3669} | {'test': {'num_samples': 3669, 'unique_num_labels': 37, 'min_image_width': 137, 'average_image_width': 443.46, 'max_image_width': 1646, 'min_image_height': 103, 'average_image_height': 399.38, 'max_image_height': 2160, 'min_label_text_length': 32, 'average_label_text_length': 40.68, 'max_label_text_length': 55, 'labels': {'0': {'count': 98}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 88}, '8': {'count': 99}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 97}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 99}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 99}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 89}, '35': {'count': 100}, '36': {'count': 100}}}} | -| [PAC](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None | +| [PAC.v2](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None | | [PAWSX](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [PIQA](https://arxiv.org/abs/1911.11641) (Bisk et al., 2020) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [PROALegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -578,18 +666,18 @@ The following tables give you an overview of the tasks in MTEB. | [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) (Daniel Khashabi, 2021) | ['fas'] | PairClassification | s2s | [Reviews, Written] | None | None | | [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) (Veeling et al., 2018) | ['eng'] | ImageClassification | i2i | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | | [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) (Veeling et al., 2018) | ['eng'] | ZeroShotClassification | i2t | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'min_label_text_length': 35, 'average_label_text_length': 52.0, 'max_label_text_length': 69, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | -| [PatentClassification](https://aclanthology.org/P19-1212.pdf) (Sharma et al., 2019) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [PatentClassification.v2](https://aclanthology.org/P19-1212.pdf) (Sharma et al., 2019) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | | [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None | -| [PersianTextEmotion](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | +| [PersianTextEmotion.v2](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | | [PersianWebDocumentRetrieval](https://ieeexplore.ieee.org/document/10553090) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [PersonalJurisdictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PhincBitextMining](https://huggingface.co/datasets/veezbo/phinc) (Srivastava et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | {'train': 13738} | {'train': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 13736, 'hf_subset_descriptive_stats': {'eng-eng_hin': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 13736}}}} | | [PlscClusteringP2P.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PlscClusteringS2S.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | -| [PoemSentimentClassification](https://arxiv.org/abs/2011.02686) (Emily Sheng, 2020) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | -| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) (Koco{\'n, 2019) | ['pol'] | Classification | s2s | [Social, Written] | None | None | -| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PoemSentimentClassification.v2](https://arxiv.org/abs/2011.02686) (Emily Sheng, 2020) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | +| [PolEmo2.0-IN.v2](https://aclanthology.org/K19-1092.pdf) (Koco{\'n, 2019) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PolEmo2.0-OUT.v2](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, News, Non-fiction, Social, Spoken, Web, Written] | None | None | | [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | BitextMining | s2s | [Chemistry] | None | None | @@ -605,6 +693,7 @@ The following tables give you an overview of the tasks in MTEB. | [Quora-NL](https://huggingface.co/datasets/clips/beir-nl-quora) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2s | [Written] | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | +| [Quora-VN](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Loc Pham, 2025) | ['vie'] | Retrieval | s2s | [Blog, Web, Written] | None | None | | [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Blog, Web, Written] | None | None | | [QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | @@ -630,7 +719,9 @@ The following tables give you an overview of the tasks in MTEB. | [RTE3](https://aclanthology.org/W07-1401/) (Giampiccolo et al., 2007) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Gudkov et al., 2020) | ['rus'] | STS | s2s | [News, Written] | None | None | | [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) (Luo et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 142403} | {'test': {'number_of_characters': 29161615, 'num_samples': 142403, 'num_queries': 3609, 'num_documents': 138794, 'min_document_length': 9, 'average_document_length': 208.19, 'max_document_length': 508, 'unique_documents': 138794, 'num_document_images': 0, 'min_query_length': 18, 'average_query_length': 73.86, 'max_query_length': 218, 'unique_queries': 3608, 'num_query_images': 3609, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3607}} | +| [RedditClustering-VN](https://arxiv.org/abs/2104.07081) (Loc Pham, 2025) | ['vie'] | Clustering | s2s | [Social, Web, Written] | None | None | | [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None | +| [RedditClusteringP2P-VN](https://arxiv.org/abs/2104.07081) (Loc Pham, 2025) | ['vie'] | Clustering | p2p | [Social, Web, Written] | None | None | | [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | | [RenderedSST2](https://huggingface.co/datasets/clip-benchmark/wds_renderedsst2) | ['eng'] | ZeroShotClassification | i2t | [Reviews] | {'test': 1821} | {'test': {'num_samples': 1821, 'unique_num_labels': 2, 'min_image_width': 448, 'average_image_width': 448.0, 'max_image_width': 448, 'min_image_height': 448, 'average_image_height': 448.0, 'max_image_height': 448, 'min_label_text_length': 28, 'average_label_text_length': 28.0, 'max_label_text_length': 28, 'labels': {'0': {'count': 912}, '1': {'count': 909}}}} | | [RestaurantReviewSentimentClassification.v2](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | @@ -639,19 +730,26 @@ The following tables give you an overview of the tasks in MTEB. | [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | | [RomaTalesBitextMining](https://idoc.pub/documents/idocpub-zpnxm9g35ylv) | ['hun', 'rom'] | BitextMining | s2s | [Fiction, Written] | None | None | | [RomaniBibleClustering](https://romani.global.bible/info) | ['rom'] | Clustering | p2p | [Religious, Written] | None | None | -| [RomanianReviewsSentiment](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | None | None | -| [RomanianSentimentClassification](https://arxiv.org/abs/2009.08712) (Dumitrescu et al., 2020) | ['ron'] | Classification | s2s | [Reviews, Written] | None | None | +| [RomanianReviewsSentiment.v2](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | None | None | +| [RomanianSentimentClassification.v2](https://arxiv.org/abs/2009.08712) (Dumitrescu et al., 2020) | ['ron'] | Classification | s2s | [Reviews, Written] | None | None | | [RonSTS](https://openreview.net/forum?id=JH61CD7afTv) (Dumitrescu et al., 2021) | ['ron'] | STS | s2s | [News, Social, Web, Written] | None | None | | [RuBQReranking](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Reranking | s2p | [Encyclopaedic, Written] | None | None | | [RuBQRetrieval](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [RuNLUIntentClassification](https://arxiv.org/abs/1903.05566) (Xingkun Liu, 2019) | ['rus'] | Classification | t2t | | None | None | -| [RuReviewsClassification](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | +| [RuReviewsClassification.v2](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [RuSTSBenchmarkSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['rus'] | STS | s2s | [News, Social, Web, Written] | None | None | -| [RuSciBenchGRNTIClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | +| [RuSciBenchBitextMining](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | BitextMining | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchCiteRetrieval](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Retrieval | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchCitedCountRegression](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Regression | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchCociteRetrieval](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Retrieval | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchCoreRiscClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Classification | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchGRNTIClassification.v2](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Classification | p2p | [Academic, Non-fiction, Written] | None | None | | [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'min_text_length': 84, 'average_text_length': 889.81, 'max_text_length': 3143, 'min_labels_per_text': 73, 'average_labels_per_text': 1.0, 'max_labels_per_text': 74, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | -| [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | +| [RuSciBenchOECDClassification.v2](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Classification | p2p | [Academic, Non-fiction, Written] | None | None | | [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | -| [RuToxicOKMLCUPClassification](https://cups.online/ru/contests/okmlcup2020) | ['rus'] | Classification | t2t | | None | None | +| [RuSciBenchPubTypeClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Classification | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuSciBenchYearPublRegression](https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb) (Vatolin et al., 2024) | ['eng', 'rus'] | Regression | p2p | [Academic, Non-fiction, Written] | None | None | +| [RuToxicOKMLCUPClassification.v2](https://cups.online/ru/contests/okmlcup2020) | ['rus'] | Classification | t2t | | None | None | | [RuToxicOKMLCUPMultilabelClassification](https://cups.online/ru/contests/okmlcup2020) | ['rus'] | Classification | t2t | | None | None | | [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | BitextMining | s2p | [Spoken] | None | None | | [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -668,8 +766,9 @@ The following tables give you an overview of the tasks in MTEB. | [SCIDOCS-Fa](https://huggingface.co/datasets/MCINext/scidocs-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SCIDOCS-NL](https://huggingface.co/datasets/clips/beir-nl-scidocs) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | -| [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | -| [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | +| [SCIDOCS-VN](https://allenai.org/data/scidocs) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | +| [SDSEyeProtectionClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | +| [SDSGlovesClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | | [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | | [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) (Real et al., 2018) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | @@ -677,8 +776,9 @@ The following tables give you an overview of the tasks in MTEB. | [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207) (Dadas et al., 2020) | ['pol'] | PairClassification | s2s | [Reviews] | None | None | | [SICK-R](https://aclanthology.org/L14-1314/) (Marelli et al., 2014) | ['eng'] | STS | s2s | [Web, Written] | None | None | | [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) (Dadas et al., 2020) | ['pol'] | STS | s2s | [Web, Written] | None | None | +| [SICK-R-VN](https://aclanthology.org/2020.lrec-1.207) (Loc Pham, 2025) | ['vie'] | STS | s2s | [Web, Written] | None | None | | [SICKFr](https://huggingface.co/datasets/Lajavaness/SICK-fr) | ['fra'] | STS | s2s | | None | None | -| [SIDClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Academic] | None | None | +| [SIDClassification.v2](https://mcinext.com/) | ['fas'] | Classification | p2p | [Academic] | None | None | | [SIDClustring](https://www.sid.com/) | ['fas'] | Clustering | p2p | [Academic] | None | None | | [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Sap et al., 2019) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | None | None | @@ -704,6 +804,7 @@ The following tables give you an overview of the tasks in MTEB. | [STS22.v2](https://competitions.codalab.org/competitions/33835) (Chen et al., 2022) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | +| [STSBenchmark-VN](https://github.com/PhilipMay/stsb-multi-mt/) (Loc Pham, 2025) | ['vie'] | STS | s2s | [Blog, News, Written] | None | None | | [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | | [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS(multi) | i2i | [News, Social, Spoken, Web, Written] | {'dev': 15000, 'test': 13790} | {'dev': {'num_samples': 15000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'de': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'es': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'fr': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'it': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'nl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pt': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'ru': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'zh': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}}}, 'test': {'num_samples': 13790, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'de': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'es': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'fr': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'it': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'nl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pt': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'ru': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'zh': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}}}} | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2014) | ['spa'] | STS | s2s | [Written] | None | None | @@ -715,36 +816,42 @@ The following tables give you an overview of the tasks in MTEB. | [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None | | [ScandiSentClassification](https://github.com/timpal0l/ScandiSent) (Isbister et al., 2021) | ['dan', 'eng', 'fin', 'nob', 'swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SciDocsRR](https://allenai.org/data/scidocs) (Cohan et al., 2020) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | +| [SciDocsRR-VN](https://allenai.org/data/scidocs) (Loc Pham, 2025) | ['vie'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SciFact-NL](https://huggingface.co/datasets/clips/beir-nl-scifact) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [SciFact-VN](https://github.com/allenai/scifact) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SciMMIR](https://huggingface.co/datasets/m-a-p/SciMMIR) (Siwei Wu, 2024) | ['eng'] | ZeroShotClassification | i2t | [Academic] | {'test': 16263} | {'test': {'num_samples': 16263, 'unique_num_labels': 5, 'min_image_width': 44, 'average_image_width': 706.75, 'max_image_width': 1607, 'min_image_height': 37, 'average_image_height': 396.71, 'max_image_height': 1439, 'min_label_text_length': 18, 'average_label_text_length': 22.4, 'max_label_text_length': 27, 'labels': {'0': {'count': 9488}, '4': {'count': 4229}, '3': {'count': 543}, '2': {'count': 467}, '1': {'count': 1536}}}} | | [SciMMIRI2TRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2t | [Academic] | {'test': 32526} | {'test': {'number_of_characters': 4247786, 'num_samples': 32526, 'num_queries': 16263, 'num_documents': 16263, 'min_document_length': 21, 'average_document_length': 261.19, 'max_document_length': 2184, 'unique_documents': 16263, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 16263, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16263}} | | [SciMMIRT2IRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | {'test': 32526} | {'test': {'number_of_characters': 4247786, 'num_samples': 32526, 'num_queries': 16263, 'num_documents': 16263, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 16263, 'min_query_length': 21, 'average_query_length': 261.19, 'max_query_length': 2184, 'unique_queries': 16263, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16263}} | | [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None | | [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None | -| [SentiRuEval2016](https://github.com/mokoron/sentirueval) (Loukachevitch et al., 2016) | ['rus'] | Classification | t2t | | None | None | -| [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | -| [SentimentDKSF](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | -| [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | -| [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | None | None | -| [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None | +| [SentiRuEval2016.v2](https://github.com/mokoron/sentirueval) (Loukachevitch et al., 2016) | ['rus'] | Classification | t2t | | None | None | +| [SentimentAnalysisHindi.v2](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | +| [SentimentDKSF.v2](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | +| [SinhalaNewsClassification.v2](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | +| [SinhalaNewsSourceClassification.v2](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | None | None | +| [SiswatiNewsClassification.v2](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None | | [SketchyI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 477886} | {'test': {'number_of_characters': 0, 'num_samples': 477886, 'num_queries': 452886, 'num_documents': 25000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 25000, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 452886, 'min_relevant_docs_per_query': 100, 'average_relevant_docs_per_query': 100.0, 'max_relevant_docs_per_query': 100, 'unique_relevant_docs': 12500}} | -| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319, 'train': 11870} | {'test': {'num_samples': 1319, 'number_of_characters': 122279, 'num_texts_in_train': 46, 'min_text_length': 8, 'average_text_length': 92.71, 'max_text_length': 1584, 'unique_text': 1315, 'unique_labels': 2, 'labels': {'1': {'count': 360}, '0': {'count': 959}}}, 'train': {'num_samples': 11870, 'number_of_characters': 1130860, 'num_texts_in_train': None, 'min_text_length': 7, 'average_text_length': 95.27, 'max_text_length': 2112, 'unique_text': 11655, 'unique_labels': 2, 'labels': {'1': {'count': 3245}, '0': {'count': 8625}}}} | -| [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({\v{S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | +| [SlovakHateSpeechClassification.v2](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | None | None | +| [SlovakMovieReviewSentimentClassification.v2](https://arxiv.org/pdf/2304.01922) ({\v{S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | | [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None | | [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | -| [SpanishNewsClassification](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None | +| [SpanishNewsClassification.v2](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None | | [SpanishNewsClusteringP2P](https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification) | ['spa'] | Clustering | p2p | | None | None | | [SpanishPassageRetrievalS2P](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) (Kamateri et al., 2019) | ['spa'] | Retrieval | s2p | | None | None | | [SpanishPassageRetrievalS2S](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) (Kamateri et al., 2019) | ['spa'] | Retrieval | s2s | | None | None | -| [SpanishSentimentClassification](https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment) (Mollanorozy et al., 2023) | ['spa'] | Classification | s2s | [Reviews, Written] | None | None | +| [SpanishSentimentClassification.v2](https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment) (Mollanorozy et al., 2023) | ['spa'] | Classification | s2s | [Reviews, Written] | None | None | | [SpartQA](https://github.com/HLR/SpartQA_generation) (Mirzaee et al., 2021) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) (Shah et al., 2018) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | +| [SprintDuplicateQuestions-VN](https://www.aclweb.org/anthology/D18-1131/) (Loc Pham, 2025) | ['vie'] | PairClassification | s2s | [Programming, Written] | None | None | +| [StackExchangeClustering-VN](https://arxiv.org/abs/2104.07081) (Loc Pham, 2025) | ['vie'] | Clustering | s2s | [Web, Written] | None | None | | [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | +| [StackExchangeClusteringP2P-VN](https://arxiv.org/abs/2104.07081) (Loc Pham, 2025) | ['vie'] | Clustering | p2p | [Web, Written] | None | None | | [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | | [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None | +| [StackOverflowDupQuestions-VN](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Loc Pham, 2025) | ['vie'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | | [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 8041} | {'test': {'num_samples': 8041, 'unique_num_labels': 196, 'min_image_width': 78, 'average_image_width': 701.18, 'max_image_width': 7800, 'min_image_height': 41, 'average_image_height': 483.75, 'max_image_height': 5400, 'labels': {'180': {'count': 38}, '102': {'count': 39}, '144': {'count': 44}, '186': {'count': 43}, '184': {'count': 38}, '77': {'count': 37}, '117': {'count': 41}, '164': {'count': 44}, '31': {'count': 41}, '59': {'count': 36}, '48': {'count': 37}, '107': {'count': 44}, '115': {'count': 37}, '134': {'count': 42}, '82': {'count': 40}, '50': {'count': 43}, '153': {'count': 42}, '32': {'count': 42}, '21': {'count': 42}, '150': {'count': 43}, '3': {'count': 42}, '80': {'count': 45}, '106': {'count': 44}, '190': {'count': 46}, '169': {'count': 44}, '194': {'count': 43}, '90': {'count': 38}, '4': {'count': 40}, '163': {'count': 43}, '147': {'count': 45}, '187': {'count': 43}, '43': {'count': 44}, '6': {'count': 39}, '30': {'count': 44}, '73': {'count': 43}, '29': {'count': 41}, '165': {'count': 41}, '179': {'count': 42}, '105': {'count': 41}, '2': {'count': 43}, '64': {'count': 45}, '34': {'count': 41}, '74': {'count': 44}, '84': {'count': 43}, '24': {'count': 39}, '167': {'count': 42}, '136': {'count': 43}, '133': {'count': 33}, '155': {'count': 39}, '119': {'count': 42}, '129': {'count': 41}, '127': {'count': 39}, '35': {'count': 41}, '170': {'count': 46}, '36': {'count': 38}, '63': {'count': 29}, '182': {'count': 42}, '42': {'count': 46}, '17': {'count': 42}, '75': {'count': 43}, '0': {'count': 44}, '62': {'count': 44}, '173': {'count': 41}, '16': {'count': 40}, '104': {'count': 43}, '49': {'count': 42}, '122': {'count': 44}, '81': {'count': 45}, '191': {'count': 42}, '92': {'count': 39}, '145': {'count': 43}, '95': {'count': 41}, '54': {'count': 39}, '114': {'count': 45}, '112': {'count': 42}, '151': {'count': 35}, '91': {'count': 40}, '188': {'count': 40}, '20': {'count': 42}, '33': {'count': 44}, '86': {'count': 44}, '128': {'count': 38}, '142': {'count': 40}, '19': {'count': 46}, '177': {'count': 41}, '11': {'count': 36}, '45': {'count': 43}, '60': {'count': 43}, '8': {'count': 41}, '56': {'count': 37}, '28': {'count': 42}, '120': {'count': 44}, '5': {'count': 44}, '85': {'count': 42}, '68': {'count': 38}, '22': {'count': 39}, '108': {'count': 44}, '89': {'count': 41}, '132': {'count': 42}, '125': {'count': 42}, '137': {'count': 39}, '158': {'count': 36}, '58': {'count': 44}, '123': {'count': 39}, '52': {'count': 44}, '27': {'count': 41}, '13': {'count': 42}, '70': {'count': 35}, '25': {'count': 34}, '185': {'count': 38}, '171': {'count': 44}, '9': {'count': 33}, '40': {'count': 35}, '178': {'count': 45}, '44': {'count': 32}, '97': {'count': 46}, '87': {'count': 39}, '159': {'count': 44}, '146': {'count': 44}, '51': {'count': 41}, '121': {'count': 40}, '1': {'count': 32}, '160': {'count': 48}, '78': {'count': 48}, '109': {'count': 43}, '103': {'count': 42}, '174': {'count': 30}, '181': {'count': 46}, '23': {'count': 45}, '111': {'count': 45}, '166': {'count': 47}, '172': {'count': 43}, '66': {'count': 38}, '192': {'count': 41}, '148': {'count': 42}, '72': {'count': 44}, '141': {'count': 32}, '71': {'count': 45}, '7': {'count': 45}, '152': {'count': 44}, '183': {'count': 40}, '98': {'count': 27}, '94': {'count': 45}, '126': {'count': 41}, '100': {'count': 42}, '131': {'count': 43}, '116': {'count': 42}, '39': {'count': 39}, '149': {'count': 36}, '101': {'count': 39}, '139': {'count': 42}, '69': {'count': 42}, '12': {'count': 41}, '14': {'count': 43}, '96': {'count': 42}, '41': {'count': 34}, '189': {'count': 43}, '10': {'count': 38}, '140': {'count': 34}, '26': {'count': 35}, '57': {'count': 44}, '88': {'count': 44}, '67': {'count': 40}, '93': {'count': 43}, '193': {'count': 45}, '161': {'count': 45}, '118': {'count': 68}, '110': {'count': 42}, '154': {'count': 42}, '138': {'count': 42}, '143': {'count': 46}, '61': {'count': 37}, '176': {'count': 44}, '113': {'count': 45}, '18': {'count': 40}, '53': {'count': 40}, '47': {'count': 42}, '157': {'count': 29}, '168': {'count': 38}, '124': {'count': 43}, '79': {'count': 43}, '130': {'count': 42}, '46': {'count': 35}, '55': {'count': 46}, '195': {'count': 40}, '38': {'count': 36}, '37': {'count': 40}, '99': {'count': 33}, '83': {'count': 42}, '162': {'count': 36}, '135': {'count': 24}, '175': {'count': 38}, '156': {'count': 36}, '15': {'count': 43}, '65': {'count': 41}, '76': {'count': 40}}}} | | [StanfordCarsI2IRetrieval](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 16082} | {'test': {'number_of_characters': 0, 'num_samples': 16082, 'num_queries': 8041, 'num_documents': 8041, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 8041, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 8041, 'min_relevant_docs_per_query': 23, 'average_relevant_docs_per_query': 40.49, 'max_relevant_docs_per_query': 67, 'unique_relevant_docs': 8041}} | @@ -753,10 +860,10 @@ The following tables give you an overview of the tasks in MTEB. | [SugarCrepe](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 7511} | {'test': {'num_samples': 7511, 'num_images': 7511, 'num_texts': 15022, 'num_unique_texts': 11844, 'min_text_length': 24, 'average_text_length': 56.49, 'max_text_length': 210}} | | [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | Summarization | p2p | [News, Written] | None | None | | [SummEvalSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['eng'] | Summarization | p2p | [News, Written] | None | None | -| [SwahiliNewsClassification](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) (Davis et al., 2020) | ['swa'] | Classification | s2s | [News, Written] | None | None | +| [SwahiliNewsClassification.v2](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) (Davis et al., 2020) | ['swa'] | Classification | s2s | [News, Written] | None | None | | [SweFaqRetrieval](https://spraakbanken.gu.se/en/resources/superlim) (Berdi{\v{c, 2023) | ['swe'] | Retrieval | s2s | [Government, Non-fiction, Written] | None | None | -| [SweRecClassification](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | -| [SwedishSentimentClassification](https://huggingface.co/datasets/swedish_reviews) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SweRecClassification.v2](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SwedishSentimentClassification.v2](https://huggingface.co/datasets/swedish_reviews) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SwednClusteringP2P](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | | [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | @@ -788,26 +895,27 @@ The following tables give you an overview of the tasks in MTEB. | [SynPerQARetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [SynPerSTS](https://mcinext.com/) | ['fas'] | STS | s2s | [Blog, News, Religious, Web] | None | None | | [SynPerTextKeywordsPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Blog, News, Religious, Web] | None | None | -| [SynPerTextToneClassification](https://mcinext.com/) | ['fas'] | Classification | s2p | | None | None | +| [SynPerTextToneClassification.v2](https://mcinext.com/) | ['fas'] | Classification | s2p | | None | None | | [SyntecReranking](https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Reranking | s2p | [Legal, Written] | None | None | | [SyntecRetrieval](https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Retrieval | s2p | [Legal, Written] | None | None | | [SyntheticText2SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql) (Meyer et al., 2024) | ['eng', 'sql'] | Retrieval | p2p | [Programming, Written] | {'test': 111702} | {'test': {'number_of_characters': 14041553, 'num_samples': 111702, 'num_queries': 5851, 'num_documents': 105851, 'min_document_length': 13, 'average_document_length': 4.58, 'max_document_length': 281, 'unique_documents': 105851, 'min_query_length': 17, 'average_query_length': 2316.95, 'max_query_length': 762, 'unique_queries': 5851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5851}} | | [T2Reranking](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Reranking | s2s | | None | None | | [T2Retrieval](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Retrieval | s2p | [Academic, Financial, Government, Medical, Non-fiction] | None | None | | [TERRa](https://arxiv.org/pdf/2010.15925) (Shavrina et al., 2020) | ['rus'] | PairClassification | s2s | [News, Web, Written] | None | None | -| [TNews](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | +| [TNews.v2](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | | [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [TRECCOVID-NL](https://colab.research.google.com/drive/1R99rjeAGt8S9IfAIRR3wS052sNu3Bjo-#scrollTo=4HduGW6xHnrZ) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None | +| [TRECCOVID-VN](https://ir.nist.gov/covidSubmit/index.html) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TUBerlinT2IRetrieval](https://dl.acm.org/doi/pdf/10.1145/2185520.2185540?casa_token=tq-eUx5UROYAAAAA:_694nPzE7tali6LCkxQc0M-mlo9xslasPMcVnFPMy9tDfvt7lg7p1RTe-k8VWCjuv9gmkQqasKUZ) (Eitz et al., 2012) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 20250} | {'test': {'number_of_characters': 1810, 'num_samples': 20250, 'num_queries': 250, 'num_documents': 20000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 20000, 'min_query_length': 2, 'average_query_length': 7.24, 'max_query_length': 18, 'unique_queries': 250, 'num_query_images': 0, 'min_relevant_docs_per_query': 80, 'average_relevant_docs_per_query': 80.0, 'max_relevant_docs_per_query': 80, 'unique_relevant_docs': 20000}} | | [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) (Flansmose Mikkelsen et al., 2022) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | | [TalemaaderPC](https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet) ({Det Danske Sprog- og Litteraturselskab, 2024) | ['dan'] | PairClassification | s2s | [Academic, Written] | None | None | -| [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None | +| [TamilNewsClassification.v2](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None | | [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) (Tatoeba community, 2021) | ['afr', 'amh', 'ang', 'ara', 'arq', 'arz', 'ast', 'awa', 'aze', 'bel', 'ben', 'ber', 'bos', 'bre', 'bul', 'cat', 'cbk', 'ceb', 'ces', 'cha', 'cmn', 'cor', 'csb', 'cym', 'dan', 'deu', 'dsb', 'dtp', 'ell', 'eng', 'epo', 'est', 'eus', 'fao', 'fin', 'fra', 'fry', 'gla', 'gle', 'glg', 'gsw', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ido', 'ile', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kat', 'kaz', 'khm', 'kor', 'kur', 'kzj', 'lat', 'lfn', 'lit', 'lvs', 'mal', 'mar', 'max', 'mhr', 'mkd', 'mon', 'nds', 'nld', 'nno', 'nob', 'nov', 'oci', 'orv', 'pam', 'pes', 'pms', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'swg', 'swh', 'tam', 'tat', 'tel', 'tgl', 'tha', 'tuk', 'tur', 'tzl', 'uig', 'ukr', 'urd', 'uzb', 'vie', 'war', 'wuu', 'xho', 'yid', 'yue', 'zsm'] | BitextMining | s2s | [Written] | None | None | | [TbilisiCityHallBitextMining](https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles) | ['eng', 'kat'] | BitextMining | s2s | [News, Written] | {'test': 3640} | {'test': {'num_samples': 3640, 'number_of_characters': 572146, 'unique_pairs': 3640, 'min_sentence1_length': 13, 'average_sentence1_length': 78.59, 'max_sentence1_length': 203, 'unique_sentence1': 3636, 'min_sentence2_length': 13, 'average_sentence2_length': 78.59, 'max_sentence2_length': 203, 'unique_sentence2': 3636, 'hf_subset_descriptive_stats': {'kat_Geor-eng_Latn': {'num_samples': 1820, 'number_of_characters': 286073, 'unique_pairs': 1820, 'min_sentence1_length': 30, 'average_sentence1_length': 76.07, 'max_sentence1_length': 189, 'unique_sentence1': 1820, 'min_sentence2_length': 13, 'average_sentence2_length': 81.12, 'max_sentence2_length': 203, 'unique_sentence2': 1816}, 'eng_Latn-kat_Geor': {'num_samples': 1820, 'number_of_characters': 286073, 'unique_pairs': 1820, 'min_sentence1_length': 13, 'average_sentence1_length': 81.12, 'max_sentence1_length': 203, 'unique_sentence1': 1816, 'min_sentence2_length': 30, 'average_sentence2_length': 76.07, 'max_sentence2_length': 189, 'unique_sentence2': 1820}}}} | | [TelemarketingSalesRuleLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [TeluguAndhraJyotiNewsClassification](https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset) | ['tel'] | Classification | s2s | [News, Written] | None | None | +| [TeluguAndhraJyotiNewsClassification.v2](https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset) | ['tel'] | Classification | s2s | [News, Written] | None | None | | [TempReasonL1](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [TempReasonL2Context](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [TempReasonL2Fact](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | @@ -815,7 +923,7 @@ The following tables give you an overview of the tasks in MTEB. | [TempReasonL3Context](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [TempReasonL3Fact](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [TempReasonL3Pure](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TenKGnadClassification](https://tblock.github.io/10kGNAD/) (Dietmar Schabus, 2017) | ['deu'] | Classification | p2p | [News, Written] | None | None | +| [TenKGnadClassification.v2](https://tblock.github.io/10kGNAD/) (Dietmar Schabus, 2017) | ['deu'] | Classification | p2p | [News, Written] | None | None | | [TenKGnadClusteringP2P.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [TenKGnadClusteringS2S.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | | [TextualismToolDictionariesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -827,29 +935,35 @@ The following tables give you an overview of the tasks in MTEB. | [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | | [Touche2020-NL](https://huggingface.co/datasets/clips/beir-nl-webis-touche2020) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Non-fiction] | None | None | +| [Touche2020-VN](https://webis.de/events/touche-20/shared-task-1.html) (Loc Pham, 2025) | ['vie'] | Retrieval | s2p | [Academic] | None | None | | [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) (Nandan Thakur, 2024) | ['eng'] | Retrieval | s2p | [Academic] | {'test': 303781} | {'test': {'number_of_characters': 637047138, 'num_samples': 303781, 'num_queries': 49, 'num_documents': 303732, 'min_document_length': 16, 'average_document_length': 0.01, 'max_document_length': 83, 'unique_documents': 303732, 'min_query_length': 41, 'average_query_length': 13000918.57, 'max_query_length': 105983, 'unique_queries': 49, 'min_relevant_docs_per_query': 40, 'average_relevant_docs_per_query': 58.14, 'max_relevant_docs_per_query': 87, 'unique_relevant_docs': 2732}} | -| [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | -| [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | -| [TswanaNewsClassification](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | None | None | +| [ToxicChatClassification.v2](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | +| [ToxicConversationsClassification.v2](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [ToxicConversationsVNClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Social, Written] | None | None | +| [TswanaNewsClassification.v2](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | None | None | | [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Academic, Encyclopaedic, Non-fiction, Written] | None | None | | [TurkicClassification](https://huggingface.co/datasets/Electrotubbie/classification_Turkic_languages/) | ['bak', 'kaz', 'kir'] | Classification | s2s | [News, Written] | None | None | -| [TurkishMovieSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | -| [TurkishProductSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | +| [TurkishMovieSentimentClassification.v2](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | +| [TurkishProductSentimentClassification.v2](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | | [TweetEmotionClassification.v2](https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8) (Al-Khatib et al., 2018) | ['ara'] | Classification | s2s | [Social, Written] | None | None | | [TweetSarcasmClassification.v2](https://aclanthology.org/2020.osact-1.5/) (Abu Farha et al., 2020) | ['ara'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) (Barbieri et al., 2022) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | None | None | -| [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | None | None | -| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) (Antypas et al., 2022) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | +| [TweetSentimentExtractionClassification.v2](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [TweetSentimentExtractionVNClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Loc Pham, 2025) | ['vie'] | Classification | s2s | [Social, Written] | None | None | +| [TweetTopicSingleClassification.v2](https://arxiv.org/abs/2209.09824) (Antypas et al., 2022) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | +| [TwentyNewsgroupsClustering-VN](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Loc Pham, 2025) | ['vie'] | Clustering | s2s | [News, Written] | None | None | | [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 59545} | {'test': {'num_samples': 59545, 'number_of_characters': 1907719, 'min_text_length': 11, 'average_text_length': 32.04, 'max_text_length': 120, 'min_labels_per_text': 2082, 'average_labels_per_text': 1.0, 'max_labels_per_text': 3236, 'unique_labels': 20, 'labels': {'12': {'count': 3137}, '6': {'count': 3070}, '0': {'count': 2613}, '2': {'count': 3155}, '10': {'count': 3220}, '17': {'count': 2986}, '14': {'count': 3106}, '13': {'count': 3055}, '1': {'count': 3056}, '16': {'count': 2911}, '9': {'count': 2984}, '3': {'count': 3070}, '15': {'count': 3090}, '7': {'count': 3036}, '5': {'count': 3124}, '11': {'count': 3236}, '18': {'count': 2483}, '8': {'count': 3090}, '19': {'count': 2082}, '4': {'count': 3041}}}} | | [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | None | None | | [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) (Xu et al., 2015) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | +| [TwitterSemEval2015-VN](https://alt.qcri.org/semeval2015/task1/) (Loc Pham, 2025) | ['vie'] | PairClassification | s2s | [Social, Written] | None | None | | [TwitterURLCorpus](https://languagenet.github.io/) (Lan et al., 2017) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | +| [TwitterURLCorpus-VN](https://languagenet.github.io/) (Loc Pham, 2025) | ['vie'] | PairClassification | s2s | [Social, Written] | None | None | | [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 697222} | {'test': {'num_samples': 697222, 'unique_num_labels': 101, 'min_image_width': 320, 'average_image_width': 320.12, 'max_image_width': 400, 'min_image_height': 226, 'average_image_height': 239.98, 'max_image_height': 240, 'labels': {'0': {'count': 7475}, '1': {'count': 6341}, '2': {'count': 6181}, '3': {'count': 6320}, '4': {'count': 3708}, '5': {'count': 7296}, '6': {'count': 4004}, '7': {'count': 3923}, '8': {'count': 2267}, '9': {'count': 5587}, '10': {'count': 8946}, '11': {'count': 12714}, '12': {'count': 6053}, '13': {'count': 3191}, '14': {'count': 3696}, '15': {'count': 5468}, '16': {'count': 10032}, '17': {'count': 8346}, '18': {'count': 5098}, '19': {'count': 10811}, '20': {'count': 6378}, '21': {'count': 3385}, '22': {'count': 3974}, '23': {'count': 4781}, '24': {'count': 5867}, '25': {'count': 7904}, '26': {'count': 12181}, '27': {'count': 4511}, '28': {'count': 4402}, '29': {'count': 5513}, '30': {'count': 3236}, '31': {'count': 7160}, '32': {'count': 6455}, '33': {'count': 3766}, '34': {'count': 8362}, '35': {'count': 3521}, '36': {'count': 3263}, '37': {'count': 5112}, '38': {'count': 9685}, '39': {'count': 4598}, '40': {'count': 6682}, '41': {'count': 8690}, '42': {'count': 3591}, '43': {'count': 11432}, '44': {'count': 3458}, '45': {'count': 10080}, '46': {'count': 16507}, '47': {'count': 3001}, '48': {'count': 6524}, '49': {'count': 7786}, '50': {'count': 4657}, '51': {'count': 8795}, '52': {'count': 3992}, '53': {'count': 5668}, '54': {'count': 6575}, '55': {'count': 8662}, '56': {'count': 5253}, '57': {'count': 3761}, '58': {'count': 8679}, '59': {'count': 11986}, '60': {'count': 15720}, '61': {'count': 12080}, '62': {'count': 10634}, '63': {'count': 6161}, '64': {'count': 13934}, '65': {'count': 8393}, '66': {'count': 5452}, '67': {'count': 7905}, '68': {'count': 12354}, '69': {'count': 4060}, '70': {'count': 9075}, '71': {'count': 2689}, '72': {'count': 5435}, '73': {'count': 17655}, '74': {'count': 5693}, '75': {'count': 12572}, '76': {'count': 9543}, '77': {'count': 10793}, '78': {'count': 4134}, '79': {'count': 4832}, '80': {'count': 8977}, '81': {'count': 7381}, '82': {'count': 4927}, '83': {'count': 12469}, '84': {'count': 3843}, '85': {'count': 4945}, '86': {'count': 6724}, '87': {'count': 6582}, '88': {'count': 7046}, '89': {'count': 5874}, '90': {'count': 4878}, '91': {'count': 6417}, '92': {'count': 3762}, '93': {'count': 7349}, '94': {'count': 8149}, '95': {'count': 3925}, '96': {'count': 3378}, '97': {'count': 7721}, '98': {'count': 3671}, '99': {'count': 6292}, '100': {'count': 6508}}}} | | [UCF101ZeroShot](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 697222} | {'test': {'num_samples': 697222, 'unique_num_labels': 101, 'min_image_width': 320, 'average_image_width': 320.12, 'max_image_width': 400, 'min_image_height': 226, 'average_image_height': 239.98, 'max_image_height': 240, 'min_label_text_length': 15, 'average_label_text_length': 21.77, 'max_label_text_length': 29, 'labels': {'0': {'count': 7475}, '1': {'count': 6341}, '2': {'count': 6181}, '3': {'count': 6320}, '4': {'count': 3708}, '5': {'count': 7296}, '6': {'count': 4004}, '7': {'count': 3923}, '8': {'count': 2267}, '9': {'count': 5587}, '10': {'count': 8946}, '11': {'count': 12714}, '12': {'count': 6053}, '13': {'count': 3191}, '14': {'count': 3696}, '15': {'count': 5468}, '16': {'count': 10032}, '17': {'count': 8346}, '18': {'count': 5098}, '19': {'count': 10811}, '20': {'count': 6378}, '21': {'count': 3385}, '22': {'count': 3974}, '23': {'count': 4781}, '24': {'count': 5867}, '25': {'count': 7904}, '26': {'count': 12181}, '27': {'count': 4511}, '28': {'count': 4402}, '29': {'count': 5513}, '30': {'count': 3236}, '31': {'count': 7160}, '32': {'count': 6455}, '33': {'count': 3766}, '34': {'count': 8362}, '35': {'count': 3521}, '36': {'count': 3263}, '37': {'count': 5112}, '38': {'count': 9685}, '39': {'count': 4598}, '40': {'count': 6682}, '41': {'count': 8690}, '42': {'count': 3591}, '43': {'count': 11432}, '44': {'count': 3458}, '45': {'count': 10080}, '46': {'count': 16507}, '47': {'count': 3001}, '48': {'count': 6524}, '49': {'count': 7786}, '50': {'count': 4657}, '51': {'count': 8795}, '52': {'count': 3992}, '53': {'count': 5668}, '54': {'count': 6575}, '55': {'count': 8662}, '56': {'count': 5253}, '57': {'count': 3761}, '58': {'count': 8679}, '59': {'count': 11986}, '60': {'count': 15720}, '61': {'count': 12080}, '62': {'count': 10634}, '63': {'count': 6161}, '64': {'count': 13934}, '65': {'count': 8393}, '66': {'count': 5452}, '67': {'count': 7905}, '68': {'count': 12354}, '69': {'count': 4060}, '70': {'count': 9075}, '71': {'count': 2689}, '72': {'count': 5435}, '73': {'count': 17655}, '74': {'count': 5693}, '75': {'count': 12572}, '76': {'count': 9543}, '77': {'count': 10793}, '78': {'count': 4134}, '79': {'count': 4832}, '80': {'count': 8977}, '81': {'count': 7381}, '82': {'count': 4927}, '83': {'count': 12469}, '84': {'count': 3843}, '85': {'count': 4945}, '86': {'count': 6724}, '87': {'count': 6582}, '88': {'count': 7046}, '89': {'count': 5874}, '90': {'count': 4878}, '91': {'count': 6417}, '92': {'count': 3762}, '93': {'count': 7349}, '94': {'count': 8149}, '95': {'count': 3925}, '96': {'count': 3378}, '97': {'count': 7721}, '98': {'count': 3671}, '99': {'count': 6292}, '100': {'count': 6508}}}} | -| [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) (Rao et al., 2018) | ['ukr'] | Classification | s2s | [News, Written] | None | None | +| [UkrFormalityClassification.v2](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) (Rao et al., 2018) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | +| [UrduRomanSentimentClassification.v2](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | | [VDRMultilingualRetrieval](https://huggingface.co/datasets/llamaindex/vdr-multilingual-test) (LlamaIndex, 2025) | ['deu', 'eng', 'fra', 'ita', 'spa'] | Retrieval | it2it | [Web] | None | None | | [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | @@ -872,7 +986,7 @@ The following tables give you an overview of the tasks in MTEB. | [VidoreTatdqaRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 1923} | {'test': {'number_of_characters': 120235, 'num_samples': 1923, 'num_queries': 1646, 'num_documents': 277, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 277, 'min_query_length': 15, 'average_query_length': 73.05, 'max_query_length': 194, 'unique_queries': 1646, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.01, 'max_relevant_docs_per_query': 10, 'unique_relevant_docs': 277}} | | [VieMedEVBitextMining](https://aclanthology.org/2015.iwslt-evaluation.11/) (Nhu Vo, 2024) | ['eng', 'vie'] | BitextMining | s2s | [Medical, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 575910, 'unique_pairs': 2048, 'min_sentence1_length': 11, 'average_sentence1_length': 139.23, 'max_sentence1_length': 1291, 'unique_sentence1': 2048, 'min_sentence2_length': 11, 'average_sentence2_length': 141.98, 'max_sentence2_length': 1217, 'unique_sentence2': 2047}} | | [VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) (Nguyen et al., 2020) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | None | None | -| [VieStudentFeedbackClassification](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | +| [VieStudentFeedbackClassification.v2](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | | [VisualNewsI2TRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 557568} | {'test': {'number_of_characters': 58422402, 'num_samples': 557568, 'num_queries': 20000, 'num_documents': 537568, 'min_document_length': 2, 'average_document_length': 108.68, 'max_document_length': 2751, 'unique_documents': 537568, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 20000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 19995}} | | [VisualNewsT2IRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 562241} | {'test': {'number_of_characters': 2204490, 'num_samples': 562241, 'num_queries': 19995, 'num_documents': 542246, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 542246, 'min_query_length': 2, 'average_query_length': 110.25, 'max_query_length': 1356, 'unique_queries': 19995, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 20000}} | | [VisualSTS-b-Eng](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [News, Social, Spoken, Web, Written] | None | None | @@ -882,8 +996,8 @@ The following tables give you an overview of the tasks in MTEB. | [VizWizIT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/papers/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.pdf) (Gurari et al., 2018) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | {'test': 6410} | {'test': {'number_of_characters': 181824, 'num_samples': 6410, 'num_queries': 4319, 'num_documents': 2091, 'min_document_length': 1, 'average_document_length': 14.45, 'max_document_length': 94, 'unique_documents': 2091, 'num_document_images': 0, 'min_query_length': 7, 'average_query_length': 35.1, 'max_query_length': 264, 'unique_queries': 2798, 'num_query_images': 4319, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2091}} | | [VoyageMMarcoReranking](https://arxiv.org/abs/2312.16144) (Benjamin Clavié, 2023) | ['jpn'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [WITT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['ara', 'bul', 'dan', 'ell', 'eng', 'est', 'ind', 'jpn', 'kor', 'tur', 'vie'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 18137} | {'test': {'number_of_characters': 506601, 'num_samples': 18137, 'num_queries': 9584, 'num_documents': 8553, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 8553, 'min_query_length': 9, 'average_query_length': 52.86, 'max_query_length': 779, 'unique_queries': 9076, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8553, 'hf_subset_descriptive_stats': {'ar': {'number_of_characters': 46144, 'num_samples': 1682, 'num_queries': 890, 'num_documents': 792, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 792, 'min_query_length': 4, 'average_query_length': 51.85, 'max_query_length': 533, 'unique_queries': 871, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 792}, 'bg': {'number_of_characters': 40682, 'num_samples': 1666, 'num_queries': 860, 'num_documents': 806, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 806, 'min_query_length': 4, 'average_query_length': 47.3, 'max_query_length': 771, 'unique_queries': 830, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 806}, 'da': {'number_of_characters': 48235, 'num_samples': 1705, 'num_queries': 891, 'num_documents': 814, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 814, 'min_query_length': 4, 'average_query_length': 54.14, 'max_query_length': 537, 'unique_queries': 889, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 814}, 'el': {'number_of_characters': 30842, 'num_samples': 1111, 'num_queries': 570, 'num_documents': 541, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 541, 'min_query_length': 1, 'average_query_length': 54.11, 'max_query_length': 404, 'unique_queries': 565, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 541}, 'et': {'number_of_characters': 33995, 'num_samples': 1654, 'num_queries': 874, 'num_documents': 780, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 780, 'min_query_length': 3, 'average_query_length': 38.9, 'max_query_length': 588, 'unique_queries': 750, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 780}, 'id': {'number_of_characters': 45428, 'num_samples': 1755, 'num_queries': 901, 'num_documents': 854, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 854, 'min_query_length': 1, 'average_query_length': 50.42, 'max_query_length': 628, 'unique_queries': 863, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 854}, 'ko': {'number_of_characters': 18304, 'num_samples': 1820, 'num_queries': 931, 'num_documents': 889, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 889, 'min_query_length': 2, 'average_query_length': 19.66, 'max_query_length': 168, 'unique_queries': 905, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 889}, 'ja': {'number_of_characters': 21706, 'num_samples': 1842, 'num_queries': 1000, 'num_documents': 842, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 842, 'min_query_length': 2, 'average_query_length': 21.71, 'max_query_length': 368, 'unique_queries': 875, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 842}, 'tr': {'number_of_characters': 33434, 'num_samples': 1402, 'num_queries': 721, 'num_documents': 681, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 681, 'min_query_length': 4, 'average_query_length': 46.37, 'max_query_length': 408, 'unique_queries': 712, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 681}, 'vi': {'number_of_characters': 53181, 'num_samples': 1815, 'num_queries': 946, 'num_documents': 869, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 869, 'min_query_length': 3, 'average_query_length': 56.22, 'max_query_length': 476, 'unique_queries': 921, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 869}, 'en': {'number_of_characters': 57978, 'num_samples': 1685, 'num_queries': 1000, 'num_documents': 685, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 685, 'min_query_length': 4, 'average_query_length': 57.98, 'max_query_length': 690, 'unique_queries': 895, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 685}}}} | -| [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) (Kajiwara et al., 2021) | ['jpn'] | Classification | s2s | [Social, Written] | None | None | -| [Waimai](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | +| [WRIMEClassification.v2](https://aclanthology.org/2021.naacl-main.169/) (Kajiwara et al., 2021) | ['jpn'] | Classification | s2s | [Social, Written] | None | None | +| [Waimai.v2](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [WebFAQBitextMiningQAs](https://huggingface.co/PaDaS-Lab) (Michael Dinzinger, 2025) | ['ara', 'aze', 'ben', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fas', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'isl', 'ita', 'jpn', 'kat', 'kaz', 'kor', 'lav', 'lit', 'mar', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'srp', 'swe', 'tgl', 'tha', 'tur', 'ukr', 'urd', 'vie', 'zho'] | BitextMining | p2p | [Web, Written] | {'default': 682057} | {'default': {'num_samples': 682057, 'number_of_characters': 526563222, 'unique_pairs': 682057, 'min_sentence1_length': 43, 'average_sentence1_length': 386.04, 'max_sentence1_length': 18410, 'unique_sentence1': 379274, 'min_sentence2_length': 41, 'average_sentence2_length': 385.98, 'max_sentence2_length': 21081, 'unique_sentence2': 398878, 'hf_subset_descriptive_stats': {'ara-fas': {'num_samples': 609, 'number_of_characters': 411293, 'unique_pairs': 609, 'min_sentence1_length': 54, 'average_sentence1_length': 321.44, 'max_sentence1_length': 2446, 'unique_sentence1': 609, 'min_sentence2_length': 63, 'average_sentence2_length': 353.91, 'max_sentence2_length': 2754, 'unique_sentence2': 609}, 'ara-heb': {'num_samples': 978, 'number_of_characters': 628664, 'unique_pairs': 978, 'min_sentence1_length': 68, 'average_sentence1_length': 336.2, 'max_sentence1_length': 4204, 'unique_sentence1': 978, 'min_sentence2_length': 62, 'average_sentence2_length': 306.61, 'max_sentence2_length': 3077, 'unique_sentence2': 978}, 'jpn-kor': {'num_samples': 4820, 'number_of_characters': 1914641, 'unique_pairs': 4820, 'min_sentence1_length': 43, 'average_sentence1_length': 194.26, 'max_sentence1_length': 1741, 'unique_sentence1': 4820, 'min_sentence2_length': 43, 'average_sentence2_length': 202.97, 'max_sentence2_length': 1830, 'unique_sentence2': 4820}, 'jpn-vie': {'num_samples': 1356, 'number_of_characters': 823292, 'unique_pairs': 1356, 'min_sentence1_length': 50, 'average_sentence1_length': 202.49, 'max_sentence1_length': 1660, 'unique_sentence1': 1356, 'min_sentence2_length': 71, 'average_sentence2_length': 404.66, 'max_sentence2_length': 3938, 'unique_sentence2': 1356}, 'jpn-zho': {'num_samples': 1728, 'number_of_characters': 587091, 'unique_pairs': 1728, 'min_sentence1_length': 49, 'average_sentence1_length': 198.45, 'max_sentence1_length': 6803, 'unique_sentence1': 1728, 'min_sentence2_length': 43, 'average_sentence2_length': 141.3, 'max_sentence2_length': 4500, 'unique_sentence2': 1728}, 'kor-vie': {'num_samples': 1386, 'number_of_characters': 906969, 'unique_pairs': 1386, 'min_sentence1_length': 50, 'average_sentence1_length': 224.99, 'max_sentence1_length': 3677, 'unique_sentence1': 1386, 'min_sentence2_length': 71, 'average_sentence2_length': 429.39, 'max_sentence2_length': 6989, 'unique_sentence2': 1386}, 'kor-zho': {'num_samples': 1087, 'number_of_characters': 354672, 'unique_pairs': 1087, 'min_sentence1_length': 64, 'average_sentence1_length': 193.25, 'max_sentence1_length': 1064, 'unique_sentence1': 1087, 'min_sentence2_length': 45, 'average_sentence2_length': 133.03, 'max_sentence2_length': 729, 'unique_sentence2': 1087}, 'vie-zho': {'num_samples': 646, 'number_of_characters': 341786, 'unique_pairs': 646, 'min_sentence1_length': 89, 'average_sentence1_length': 388.88, 'max_sentence1_length': 2114, 'unique_sentence1': 646, 'min_sentence2_length': 45, 'average_sentence2_length': 140.2, 'max_sentence2_length': 1267, 'unique_sentence2': 646}, 'ind-msa': {'num_samples': 455, 'number_of_characters': 197047, 'unique_pairs': 455, 'min_sentence1_length': 78, 'average_sentence1_length': 217.38, 'max_sentence1_length': 870, 'unique_sentence1': 455, 'min_sentence2_length': 72, 'average_sentence2_length': 215.69, 'max_sentence2_length': 784, 'unique_sentence2': 455}, 'ind-tgl': {'num_samples': 378, 'number_of_characters': 265203, 'unique_pairs': 378, 'min_sentence1_length': 74, 'average_sentence1_length': 329.09, 'max_sentence1_length': 1418, 'unique_sentence1': 378, 'min_sentence2_length': 75, 'average_sentence2_length': 372.51, 'max_sentence2_length': 1600, 'unique_sentence2': 378}, 'ind-tha': {'num_samples': 1258, 'number_of_characters': 794128, 'unique_pairs': 1258, 'min_sentence1_length': 72, 'average_sentence1_length': 347.4, 'max_sentence1_length': 3226, 'unique_sentence1': 1258, 'min_sentence2_length': 63, 'average_sentence2_length': 283.86, 'max_sentence2_length': 2816, 'unique_sentence2': 1258}, 'bul-ces': {'num_samples': 1485, 'number_of_characters': 922134, 'unique_pairs': 1485, 'min_sentence1_length': 71, 'average_sentence1_length': 325.89, 'max_sentence1_length': 1945, 'unique_sentence1': 1485, 'min_sentence2_length': 56, 'average_sentence2_length': 295.08, 'max_sentence2_length': 1921, 'unique_sentence2': 1485}, 'bul-lav': {'num_samples': 710, 'number_of_characters': 492895, 'unique_pairs': 710, 'min_sentence1_length': 74, 'average_sentence1_length': 358.15, 'max_sentence1_length': 2765, 'unique_sentence1': 710, 'min_sentence2_length': 61, 'average_sentence2_length': 336.07, 'max_sentence2_length': 2523, 'unique_sentence2': 710}, 'bul-lit': {'num_samples': 803, 'number_of_characters': 540245, 'unique_pairs': 803, 'min_sentence1_length': 71, 'average_sentence1_length': 346.31, 'max_sentence1_length': 1945, 'unique_sentence1': 803, 'min_sentence2_length': 67, 'average_sentence2_length': 326.47, 'max_sentence2_length': 1925, 'unique_sentence2': 803}, 'bul-pol': {'num_samples': 1635, 'number_of_characters': 1126043, 'unique_pairs': 1635, 'min_sentence1_length': 69, 'average_sentence1_length': 347.74, 'max_sentence1_length': 2431, 'unique_sentence1': 1635, 'min_sentence2_length': 68, 'average_sentence2_length': 340.97, 'max_sentence2_length': 2277, 'unique_sentence2': 1635}, 'bul-rus': {'num_samples': 1476, 'number_of_characters': 995879, 'unique_pairs': 1476, 'min_sentence1_length': 71, 'average_sentence1_length': 337.76, 'max_sentence1_length': 4620, 'unique_sentence1': 1476, 'min_sentence2_length': 63, 'average_sentence2_length': 336.95, 'max_sentence2_length': 4654, 'unique_sentence2': 1476}, 'bul-slk': {'num_samples': 1154, 'number_of_characters': 777946, 'unique_pairs': 1154, 'min_sentence1_length': 73, 'average_sentence1_length': 349.94, 'max_sentence1_length': 1945, 'unique_sentence1': 1154, 'min_sentence2_length': 68, 'average_sentence2_length': 324.19, 'max_sentence2_length': 2073, 'unique_sentence2': 1154}, 'bul-slv': {'num_samples': 1034, 'number_of_characters': 673719, 'unique_pairs': 1034, 'min_sentence1_length': 86, 'average_sentence1_length': 339.07, 'max_sentence1_length': 1945, 'unique_sentence1': 1034, 'min_sentence2_length': 79, 'average_sentence2_length': 312.5, 'max_sentence2_length': 1872, 'unique_sentence2': 1034}, 'bul-srp': {'num_samples': 296, 'number_of_characters': 222838, 'unique_pairs': 296, 'min_sentence1_length': 92, 'average_sentence1_length': 390.41, 'max_sentence1_length': 1945, 'unique_sentence1': 296, 'min_sentence2_length': 87, 'average_sentence2_length': 362.43, 'max_sentence2_length': 1845, 'unique_sentence2': 296}, 'bul-ukr': {'num_samples': 1074, 'number_of_characters': 708525, 'unique_pairs': 1074, 'min_sentence1_length': 64, 'average_sentence1_length': 335.27, 'max_sentence1_length': 2057, 'unique_sentence1': 1074, 'min_sentence2_length': 59, 'average_sentence2_length': 324.43, 'max_sentence2_length': 2042, 'unique_sentence2': 1074}, 'ces-lav': {'num_samples': 875, 'number_of_characters': 569552, 'unique_pairs': 875, 'min_sentence1_length': 74, 'average_sentence1_length': 318.83, 'max_sentence1_length': 1921, 'unique_sentence1': 875, 'min_sentence2_length': 76, 'average_sentence2_length': 332.08, 'max_sentence2_length': 1948, 'unique_sentence2': 875}, 'ces-lit': {'num_samples': 1002, 'number_of_characters': 674361, 'unique_pairs': 1002, 'min_sentence1_length': 73, 'average_sentence1_length': 328.37, 'max_sentence1_length': 2956, 'unique_sentence1': 1002, 'min_sentence2_length': 84, 'average_sentence2_length': 344.64, 'max_sentence2_length': 2995, 'unique_sentence2': 1002}, 'ces-pol': {'num_samples': 3367, 'number_of_characters': 2230397, 'unique_pairs': 3367, 'min_sentence1_length': 56, 'average_sentence1_length': 317.49, 'max_sentence1_length': 2453, 'unique_sentence1': 3367, 'min_sentence2_length': 64, 'average_sentence2_length': 344.94, 'max_sentence2_length': 2621, 'unique_sentence2': 3367}, 'ces-rus': {'num_samples': 2144, 'number_of_characters': 1438311, 'unique_pairs': 2144, 'min_sentence1_length': 56, 'average_sentence1_length': 319.4, 'max_sentence1_length': 2349, 'unique_sentence1': 2144, 'min_sentence2_length': 71, 'average_sentence2_length': 351.45, 'max_sentence2_length': 2509, 'unique_sentence2': 2144}, 'ces-slk': {'num_samples': 2551, 'number_of_characters': 1733126, 'unique_pairs': 2551, 'min_sentence1_length': 65, 'average_sentence1_length': 334.9, 'max_sentence1_length': 7967, 'unique_sentence1': 2551, 'min_sentence2_length': 67, 'average_sentence2_length': 344.49, 'max_sentence2_length': 10365, 'unique_sentence2': 2551}, 'ces-slv': {'num_samples': 1370, 'number_of_characters': 848116, 'unique_pairs': 1370, 'min_sentence1_length': 54, 'average_sentence1_length': 304.5, 'max_sentence1_length': 2956, 'unique_sentence1': 1370, 'min_sentence2_length': 59, 'average_sentence2_length': 314.56, 'max_sentence2_length': 3006, 'unique_sentence2': 1370}, 'ces-srp': {'num_samples': 362, 'number_of_characters': 238713, 'unique_pairs': 362, 'min_sentence1_length': 77, 'average_sentence1_length': 322.69, 'max_sentence1_length': 1921, 'unique_sentence1': 362, 'min_sentence2_length': 80, 'average_sentence2_length': 336.74, 'max_sentence2_length': 1861, 'unique_sentence2': 362}, 'ces-ukr': {'num_samples': 1285, 'number_of_characters': 789567, 'unique_pairs': 1285, 'min_sentence1_length': 56, 'average_sentence1_length': 295.05, 'max_sentence1_length': 1921, 'unique_sentence1': 1285, 'min_sentence2_length': 69, 'average_sentence2_length': 319.4, 'max_sentence2_length': 2042, 'unique_sentence2': 1285}, 'hrv-slk': {'num_samples': 313, 'number_of_characters': 184033, 'unique_pairs': 313, 'min_sentence1_length': 112, 'average_sentence1_length': 295.81, 'max_sentence1_length': 1393, 'unique_sentence1': 313, 'min_sentence2_length': 104, 'average_sentence2_length': 292.16, 'max_sentence2_length': 1411, 'unique_sentence2': 313}, 'kat-rus': {'num_samples': 262, 'number_of_characters': 190050, 'unique_pairs': 262, 'min_sentence1_length': 68, 'average_sentence1_length': 362.44, 'max_sentence1_length': 2879, 'unique_sentence1': 262, 'min_sentence2_length': 74, 'average_sentence2_length': 362.95, 'max_sentence2_length': 3069, 'unique_sentence2': 262}, 'lav-lit': {'num_samples': 1061, 'number_of_characters': 794243, 'unique_pairs': 1061, 'min_sentence1_length': 61, 'average_sentence1_length': 372.32, 'max_sentence1_length': 2410, 'unique_sentence1': 1061, 'min_sentence2_length': 67, 'average_sentence2_length': 376.26, 'max_sentence2_length': 2463, 'unique_sentence2': 1061}, 'lav-pol': {'num_samples': 951, 'number_of_characters': 701354, 'unique_pairs': 951, 'min_sentence1_length': 63, 'average_sentence1_length': 359.69, 'max_sentence1_length': 2044, 'unique_sentence1': 951, 'min_sentence2_length': 72, 'average_sentence2_length': 377.8, 'max_sentence2_length': 2234, 'unique_sentence2': 951}, 'lav-rus': {'num_samples': 1412, 'number_of_characters': 1039535, 'unique_pairs': 1412, 'min_sentence1_length': 61, 'average_sentence1_length': 358.04, 'max_sentence1_length': 2206, 'unique_sentence1': 1412, 'min_sentence2_length': 63, 'average_sentence2_length': 378.18, 'max_sentence2_length': 2383, 'unique_sentence2': 1412}, 'lav-slk': {'num_samples': 789, 'number_of_characters': 535091, 'unique_pairs': 789, 'min_sentence1_length': 75, 'average_sentence1_length': 342.33, 'max_sentence1_length': 1948, 'unique_sentence1': 789, 'min_sentence2_length': 68, 'average_sentence2_length': 335.86, 'max_sentence2_length': 1910, 'unique_sentence2': 789}, 'lav-slv': {'num_samples': 518, 'number_of_characters': 340127, 'unique_pairs': 518, 'min_sentence1_length': 76, 'average_sentence1_length': 329.34, 'max_sentence1_length': 1948, 'unique_sentence1': 518, 'min_sentence2_length': 71, 'average_sentence2_length': 327.28, 'max_sentence2_length': 1872, 'unique_sentence2': 518}, 'lav-ukr': {'num_samples': 579, 'number_of_characters': 428022, 'unique_pairs': 579, 'min_sentence1_length': 61, 'average_sentence1_length': 365.16, 'max_sentence1_length': 2410, 'unique_sentence1': 579, 'min_sentence2_length': 59, 'average_sentence2_length': 374.08, 'max_sentence2_length': 2412, 'unique_sentence2': 579}, 'lit-pol': {'num_samples': 1026, 'number_of_characters': 767128, 'unique_pairs': 1026, 'min_sentence1_length': 64, 'average_sentence1_length': 366.06, 'max_sentence1_length': 1990, 'unique_sentence1': 1026, 'min_sentence2_length': 79, 'average_sentence2_length': 381.63, 'max_sentence2_length': 2234, 'unique_sentence2': 1026}, 'lit-rus': {'num_samples': 961, 'number_of_characters': 744509, 'unique_pairs': 961, 'min_sentence1_length': 67, 'average_sentence1_length': 379.47, 'max_sentence1_length': 3141, 'unique_sentence1': 961, 'min_sentence2_length': 63, 'average_sentence2_length': 395.26, 'max_sentence2_length': 2201, 'unique_sentence2': 961}, 'lit-slk': {'num_samples': 859, 'number_of_characters': 583451, 'unique_pairs': 859, 'min_sentence1_length': 74, 'average_sentence1_length': 344.75, 'max_sentence1_length': 1925, 'unique_sentence1': 859, 'min_sentence2_length': 68, 'average_sentence2_length': 334.47, 'max_sentence2_length': 1961, 'unique_sentence2': 859}, 'lit-slv': {'num_samples': 607, 'number_of_characters': 438866, 'unique_pairs': 607, 'min_sentence1_length': 70, 'average_sentence1_length': 366.02, 'max_sentence1_length': 2995, 'unique_sentence1': 607, 'min_sentence2_length': 75, 'average_sentence2_length': 356.99, 'max_sentence2_length': 3006, 'unique_sentence2': 607}, 'lit-ukr': {'num_samples': 639, 'number_of_characters': 463616, 'unique_pairs': 639, 'min_sentence1_length': 67, 'average_sentence1_length': 361.16, 'max_sentence1_length': 2463, 'unique_sentence1': 639, 'min_sentence2_length': 59, 'average_sentence2_length': 364.37, 'max_sentence2_length': 2412, 'unique_sentence2': 639}, 'pol-rus': {'num_samples': 5014, 'number_of_characters': 3850186, 'unique_pairs': 5014, 'min_sentence1_length': 60, 'average_sentence1_length': 380.93, 'max_sentence1_length': 5103, 'unique_sentence1': 5014, 'min_sentence2_length': 59, 'average_sentence2_length': 386.95, 'max_sentence2_length': 4888, 'unique_sentence2': 5014}, 'pol-slk': {'num_samples': 1918, 'number_of_characters': 1321855, 'unique_pairs': 1918, 'min_sentence1_length': 71, 'average_sentence1_length': 354.28, 'max_sentence1_length': 5103, 'unique_sentence1': 1918, 'min_sentence2_length': 67, 'average_sentence2_length': 334.9, 'max_sentence2_length': 4641, 'unique_sentence2': 1918}, 'pol-slv': {'num_samples': 1382, 'number_of_characters': 859222, 'unique_pairs': 1382, 'min_sentence1_length': 76, 'average_sentence1_length': 317.49, 'max_sentence1_length': 2101, 'unique_sentence1': 1382, 'min_sentence2_length': 75, 'average_sentence2_length': 304.23, 'max_sentence2_length': 2015, 'unique_sentence2': 1382}, 'pol-srp': {'num_samples': 492, 'number_of_characters': 350413, 'unique_pairs': 492, 'min_sentence1_length': 82, 'average_sentence1_length': 357.26, 'max_sentence1_length': 1902, 'unique_sentence1': 492, 'min_sentence2_length': 78, 'average_sentence2_length': 354.96, 'max_sentence2_length': 1845, 'unique_sentence2': 492}, 'pol-ukr': {'num_samples': 2370, 'number_of_characters': 1753652, 'unique_pairs': 2370, 'min_sentence1_length': 59, 'average_sentence1_length': 373.9, 'max_sentence1_length': 3106, 'unique_sentence1': 2370, 'min_sentence2_length': 61, 'average_sentence2_length': 366.04, 'max_sentence2_length': 2827, 'unique_sentence2': 2370}, 'rus-slk': {'num_samples': 1263, 'number_of_characters': 905526, 'unique_pairs': 1263, 'min_sentence1_length': 69, 'average_sentence1_length': 371.39, 'max_sentence1_length': 4888, 'unique_sentence1': 1263, 'min_sentence2_length': 67, 'average_sentence2_length': 345.58, 'max_sentence2_length': 4641, 'unique_sentence2': 1263}, 'rus-slv': {'num_samples': 1096, 'number_of_characters': 719013, 'unique_pairs': 1096, 'min_sentence1_length': 84, 'average_sentence1_length': 341.51, 'max_sentence1_length': 2164, 'unique_sentence1': 1096, 'min_sentence2_length': 71, 'average_sentence2_length': 314.52, 'max_sentence2_length': 2015, 'unique_sentence2': 1096}, 'rus-srp': {'num_samples': 455, 'number_of_characters': 341619, 'unique_pairs': 455, 'min_sentence1_length': 92, 'average_sentence1_length': 386.63, 'max_sentence1_length': 1921, 'unique_sentence1': 455, 'min_sentence2_length': 90, 'average_sentence2_length': 364.18, 'max_sentence2_length': 1845, 'unique_sentence2': 455}, 'rus-ukr': {'num_samples': 15251, 'number_of_characters': 10782282, 'unique_pairs': 15251, 'min_sentence1_length': 55, 'average_sentence1_length': 358.27, 'max_sentence1_length': 3905, 'unique_sentence1': 15251, 'min_sentence2_length': 49, 'average_sentence2_length': 348.72, 'max_sentence2_length': 3801, 'unique_sentence2': 15251}, 'slk-slv': {'num_samples': 1259, 'number_of_characters': 852109, 'unique_pairs': 1259, 'min_sentence1_length': 68, 'average_sentence1_length': 338.08, 'max_sentence1_length': 1961, 'unique_sentence1': 1259, 'min_sentence2_length': 71, 'average_sentence2_length': 338.74, 'max_sentence2_length': 1872, 'unique_sentence2': 1259}, 'slk-srp': {'num_samples': 561, 'number_of_characters': 493396, 'unique_pairs': 561, 'min_sentence1_length': 82, 'average_sentence1_length': 431.13, 'max_sentence1_length': 1910, 'unique_sentence1': 561, 'min_sentence2_length': 80, 'average_sentence2_length': 448.36, 'max_sentence2_length': 1845, 'unique_sentence2': 561}, 'slk-ukr': {'num_samples': 944, 'number_of_characters': 608143, 'unique_pairs': 944, 'min_sentence1_length': 68, 'average_sentence1_length': 314.41, 'max_sentence1_length': 1910, 'unique_sentence1': 944, 'min_sentence2_length': 69, 'average_sentence2_length': 329.81, 'max_sentence2_length': 1923, 'unique_sentence2': 944}, 'slv-srp': {'num_samples': 499, 'number_of_characters': 378293, 'unique_pairs': 499, 'min_sentence1_length': 80, 'average_sentence1_length': 374.47, 'max_sentence1_length': 2476, 'unique_sentence1': 499, 'min_sentence2_length': 80, 'average_sentence2_length': 383.63, 'max_sentence2_length': 2387, 'unique_sentence2': 499}, 'slv-ukr': {'num_samples': 733, 'number_of_characters': 431361, 'unique_pairs': 733, 'min_sentence1_length': 71, 'average_sentence1_length': 286.64, 'max_sentence1_length': 1872, 'unique_sentence1': 733, 'min_sentence2_length': 69, 'average_sentence2_length': 301.85, 'max_sentence2_length': 1923, 'unique_sentence2': 733}, 'cat-deu': {'num_samples': 302, 'number_of_characters': 279459, 'unique_pairs': 302, 'min_sentence1_length': 66, 'average_sentence1_length': 451.95, 'max_sentence1_length': 2056, 'unique_sentence1': 302, 'min_sentence2_length': 77, 'average_sentence2_length': 473.41, 'max_sentence2_length': 2082, 'unique_sentence2': 302}, 'cat-fra': {'num_samples': 598, 'number_of_characters': 476709, 'unique_pairs': 598, 'min_sentence1_length': 62, 'average_sentence1_length': 381.68, 'max_sentence1_length': 2056, 'unique_sentence1': 598, 'min_sentence2_length': 74, 'average_sentence2_length': 415.5, 'max_sentence2_length': 2277, 'unique_sentence2': 598}, 'cat-ita': {'num_samples': 418, 'number_of_characters': 327132, 'unique_pairs': 418, 'min_sentence1_length': 60, 'average_sentence1_length': 388.8, 'max_sentence1_length': 2056, 'unique_sentence1': 418, 'min_sentence2_length': 55, 'average_sentence2_length': 393.81, 'max_sentence2_length': 2186, 'unique_sentence2': 418}, 'cat-por': {'num_samples': 370, 'number_of_characters': 248938, 'unique_pairs': 370, 'min_sentence1_length': 58, 'average_sentence1_length': 338.32, 'max_sentence1_length': 2056, 'unique_sentence1': 370, 'min_sentence2_length': 60, 'average_sentence2_length': 334.48, 'max_sentence2_length': 2088, 'unique_sentence2': 370}, 'cat-spa': {'num_samples': 2648, 'number_of_characters': 2308040, 'unique_pairs': 2648, 'min_sentence1_length': 62, 'average_sentence1_length': 430.96, 'max_sentence1_length': 8113, 'unique_sentence1': 2648, 'min_sentence2_length': 65, 'average_sentence2_length': 440.66, 'max_sentence2_length': 8345, 'unique_sentence2': 2648}, 'dan-deu': {'num_samples': 4337, 'number_of_characters': 3443148, 'unique_pairs': 4337, 'min_sentence1_length': 60, 'average_sentence1_length': 372.7, 'max_sentence1_length': 4236, 'unique_sentence1': 4337, 'min_sentence2_length': 69, 'average_sentence2_length': 421.21, 'max_sentence2_length': 4093, 'unique_sentence2': 4337}, 'dan-fra': {'num_samples': 3802, 'number_of_characters': 3021023, 'unique_pairs': 3802, 'min_sentence1_length': 62, 'average_sentence1_length': 365.2, 'max_sentence1_length': 4236, 'unique_sentence1': 3802, 'min_sentence2_length': 70, 'average_sentence2_length': 429.39, 'max_sentence2_length': 4717, 'unique_sentence2': 3802}, 'dan-isl': {'num_samples': 327, 'number_of_characters': 230870, 'unique_pairs': 327, 'min_sentence1_length': 81, 'average_sentence1_length': 357.51, 'max_sentence1_length': 1856, 'unique_sentence1': 327, 'min_sentence2_length': 82, 'average_sentence2_length': 348.51, 'max_sentence2_length': 1897, 'unique_sentence2': 327}, 'dan-ita': {'num_samples': 3818, 'number_of_characters': 2976506, 'unique_pairs': 3818, 'min_sentence1_length': 62, 'average_sentence1_length': 371.54, 'max_sentence1_length': 4236, 'unique_sentence1': 3818, 'min_sentence2_length': 64, 'average_sentence2_length': 408.06, 'max_sentence2_length': 4574, 'unique_sentence2': 3818}, 'dan-nld': {'num_samples': 4099, 'number_of_characters': 3047816, 'unique_pairs': 4099, 'min_sentence1_length': 68, 'average_sentence1_length': 360.12, 'max_sentence1_length': 4236, 'unique_sentence1': 4099, 'min_sentence2_length': 63, 'average_sentence2_length': 383.43, 'max_sentence2_length': 4431, 'unique_sentence2': 4099}, 'dan-nor': {'num_samples': 2603, 'number_of_characters': 1873194, 'unique_pairs': 2603, 'min_sentence1_length': 62, 'average_sentence1_length': 365.46, 'max_sentence1_length': 3505, 'unique_sentence1': 2603, 'min_sentence2_length': 59, 'average_sentence2_length': 354.17, 'max_sentence2_length': 3400, 'unique_sentence2': 2603}, 'dan-por': {'num_samples': 3206, 'number_of_characters': 2344257, 'unique_pairs': 3206, 'min_sentence1_length': 60, 'average_sentence1_length': 353.3, 'max_sentence1_length': 3843, 'unique_sentence1': 3206, 'min_sentence2_length': 65, 'average_sentence2_length': 377.91, 'max_sentence2_length': 3799, 'unique_sentence2': 3206}, 'dan-ron': {'num_samples': 2052, 'number_of_characters': 1446475, 'unique_pairs': 2052, 'min_sentence1_length': 67, 'average_sentence1_length': 336.33, 'max_sentence1_length': 4236, 'unique_sentence1': 2052, 'min_sentence2_length': 63, 'average_sentence2_length': 368.58, 'max_sentence2_length': 4285, 'unique_sentence2': 2052}, 'dan-spa': {'num_samples': 3571, 'number_of_characters': 2720999, 'unique_pairs': 3571, 'min_sentence1_length': 60, 'average_sentence1_length': 360.46, 'max_sentence1_length': 4236, 'unique_sentence1': 3571, 'min_sentence2_length': 65, 'average_sentence2_length': 401.51, 'max_sentence2_length': 4498, 'unique_sentence2': 3571}, 'dan-swe': {'num_samples': 4268, 'number_of_characters': 3115686, 'unique_pairs': 4268, 'min_sentence1_length': 63, 'average_sentence1_length': 368.21, 'max_sentence1_length': 15317, 'unique_sentence1': 4268, 'min_sentence2_length': 64, 'average_sentence2_length': 361.8, 'max_sentence2_length': 15427, 'unique_sentence2': 4268}, 'deu-fra': {'num_samples': 27727, 'number_of_characters': 24341503, 'unique_pairs': 27727, 'min_sentence1_length': 52, 'average_sentence1_length': 429.85, 'max_sentence1_length': 10595, 'unique_sentence1': 27727, 'min_sentence2_length': 60, 'average_sentence2_length': 448.05, 'max_sentence2_length': 11165, 'unique_sentence2': 27727}, 'deu-isl': {'num_samples': 294, 'number_of_characters': 221411, 'unique_pairs': 294, 'min_sentence1_length': 84, 'average_sentence1_length': 403.22, 'max_sentence1_length': 2082, 'unique_sentence1': 294, 'min_sentence2_length': 79, 'average_sentence2_length': 349.88, 'max_sentence2_length': 1897, 'unique_sentence2': 294}, 'deu-ita': {'num_samples': 18787, 'number_of_characters': 15815694, 'unique_pairs': 18787, 'min_sentence1_length': 52, 'average_sentence1_length': 424.82, 'max_sentence1_length': 7136, 'unique_sentence1': 18787, 'min_sentence2_length': 56, 'average_sentence2_length': 417.02, 'max_sentence2_length': 6634, 'unique_sentence2': 18787}, 'deu-nld': {'num_samples': 14211, 'number_of_characters': 11610783, 'unique_pairs': 14211, 'min_sentence1_length': 52, 'average_sentence1_length': 418.9, 'max_sentence1_length': 4919, 'unique_sentence1': 14211, 'min_sentence2_length': 62, 'average_sentence2_length': 398.13, 'max_sentence2_length': 4779, 'unique_sentence2': 14211}, 'deu-nor': {'num_samples': 2783, 'number_of_characters': 2150067, 'unique_pairs': 2783, 'min_sentence1_length': 63, 'average_sentence1_length': 413.82, 'max_sentence1_length': 18410, 'unique_sentence1': 2783, 'min_sentence2_length': 63, 'average_sentence2_length': 358.75, 'max_sentence2_length': 16149, 'unique_sentence2': 2783}, 'deu-por': {'num_samples': 11319, 'number_of_characters': 9085897, 'unique_pairs': 11319, 'min_sentence1_length': 63, 'average_sentence1_length': 412.04, 'max_sentence1_length': 7136, 'unique_sentence1': 11319, 'min_sentence2_length': 59, 'average_sentence2_length': 390.67, 'max_sentence2_length': 6536, 'unique_sentence2': 11319}, 'deu-ron': {'num_samples': 3598, 'number_of_characters': 2755112, 'unique_pairs': 3598, 'min_sentence1_length': 61, 'average_sentence1_length': 387.56, 'max_sentence1_length': 4093, 'unique_sentence1': 3598, 'min_sentence2_length': 55, 'average_sentence2_length': 378.17, 'max_sentence2_length': 4285, 'unique_sentence2': 3598}, 'deu-spa': {'num_samples': 19739, 'number_of_characters': 16855942, 'unique_pairs': 19739, 'min_sentence1_length': 60, 'average_sentence1_length': 430.84, 'max_sentence1_length': 7136, 'unique_sentence1': 19739, 'min_sentence2_length': 55, 'average_sentence2_length': 423.1, 'max_sentence2_length': 6963, 'unique_sentence2': 19739}, 'deu-swe': {'num_samples': 5772, 'number_of_characters': 4469906, 'unique_pairs': 5772, 'min_sentence1_length': 59, 'average_sentence1_length': 412.41, 'max_sentence1_length': 4093, 'unique_sentence1': 5772, 'min_sentence2_length': 57, 'average_sentence2_length': 362.0, 'max_sentence2_length': 4038, 'unique_sentence2': 5772}, 'fra-isl': {'num_samples': 347, 'number_of_characters': 256923, 'unique_pairs': 347, 'min_sentence1_length': 76, 'average_sentence1_length': 400.64, 'max_sentence1_length': 2277, 'unique_sentence1': 347, 'min_sentence2_length': 75, 'average_sentence2_length': 339.77, 'max_sentence2_length': 1897, 'unique_sentence2': 347}, 'fra-ita': {'num_samples': 20002, 'number_of_characters': 17269559, 'unique_pairs': 20002, 'min_sentence1_length': 62, 'average_sentence1_length': 444.94, 'max_sentence1_length': 7253, 'unique_sentence1': 20002, 'min_sentence2_length': 49, 'average_sentence2_length': 418.46, 'max_sentence2_length': 6634, 'unique_sentence2': 20002}, 'fra-nld': {'num_samples': 14684, 'number_of_characters': 12784405, 'unique_pairs': 14684, 'min_sentence1_length': 60, 'average_sentence1_length': 455.31, 'max_sentence1_length': 9107, 'unique_sentence1': 14684, 'min_sentence2_length': 51, 'average_sentence2_length': 415.33, 'max_sentence2_length': 8534, 'unique_sentence2': 14684}, 'fra-nor': {'num_samples': 2558, 'number_of_characters': 1963253, 'unique_pairs': 2558, 'min_sentence1_length': 86, 'average_sentence1_length': 421.16, 'max_sentence1_length': 4086, 'unique_sentence1': 2558, 'min_sentence2_length': 64, 'average_sentence2_length': 346.33, 'max_sentence2_length': 3400, 'unique_sentence2': 2558}, 'fra-por': {'num_samples': 13265, 'number_of_characters': 10942625, 'unique_pairs': 13265, 'min_sentence1_length': 61, 'average_sentence1_length': 431.68, 'max_sentence1_length': 7253, 'unique_sentence1': 13265, 'min_sentence2_length': 55, 'average_sentence2_length': 393.24, 'max_sentence2_length': 6536, 'unique_sentence2': 13265}, 'fra-ron': {'num_samples': 3295, 'number_of_characters': 2448321, 'unique_pairs': 3295, 'min_sentence1_length': 70, 'average_sentence1_length': 385.29, 'max_sentence1_length': 4717, 'unique_sentence1': 3295, 'min_sentence2_length': 63, 'average_sentence2_length': 357.75, 'max_sentence2_length': 4285, 'unique_sentence2': 3295}, 'fra-spa': {'num_samples': 23311, 'number_of_characters': 20477763, 'unique_pairs': 23311, 'min_sentence1_length': 54, 'average_sentence1_length': 451.84, 'max_sentence1_length': 14943, 'unique_sentence1': 23311, 'min_sentence2_length': 50, 'average_sentence2_length': 426.62, 'max_sentence2_length': 13323, 'unique_sentence2': 23311}, 'fra-swe': {'num_samples': 5006, 'number_of_characters': 3880565, 'unique_pairs': 5006, 'min_sentence1_length': 70, 'average_sentence1_length': 421.95, 'max_sentence1_length': 4717, 'unique_sentence1': 5006, 'min_sentence2_length': 59, 'average_sentence2_length': 353.23, 'max_sentence2_length': 4038, 'unique_sentence2': 5006}, 'isl-ita': {'num_samples': 421, 'number_of_characters': 310158, 'unique_pairs': 421, 'min_sentence1_length': 75, 'average_sentence1_length': 350.33, 'max_sentence1_length': 1897, 'unique_sentence1': 421, 'min_sentence2_length': 74, 'average_sentence2_length': 386.39, 'max_sentence2_length': 2186, 'unique_sentence2': 421}, 'isl-nld': {'num_samples': 311, 'number_of_characters': 228026, 'unique_pairs': 311, 'min_sentence1_length': 123, 'average_sentence1_length': 350.56, 'max_sentence1_length': 1705, 'unique_sentence1': 311, 'min_sentence2_length': 125, 'average_sentence2_length': 382.64, 'max_sentence2_length': 2142, 'unique_sentence2': 311}, 'isl-por': {'num_samples': 341, 'number_of_characters': 260057, 'unique_pairs': 341, 'min_sentence1_length': 110, 'average_sentence1_length': 367.62, 'max_sentence1_length': 1897, 'unique_sentence1': 341, 'min_sentence2_length': 128, 'average_sentence2_length': 395.01, 'max_sentence2_length': 2088, 'unique_sentence2': 341}, 'isl-spa': {'num_samples': 366, 'number_of_characters': 262085, 'unique_pairs': 366, 'min_sentence1_length': 75, 'average_sentence1_length': 336.95, 'max_sentence1_length': 1517, 'unique_sentence1': 366, 'min_sentence2_length': 73, 'average_sentence2_length': 379.13, 'max_sentence2_length': 1601, 'unique_sentence2': 366}, 'isl-swe': {'num_samples': 312, 'number_of_characters': 214179, 'unique_pairs': 312, 'min_sentence1_length': 75, 'average_sentence1_length': 342.19, 'max_sentence1_length': 1897, 'unique_sentence1': 312, 'min_sentence2_length': 78, 'average_sentence2_length': 344.28, 'max_sentence2_length': 1841, 'unique_sentence2': 312}, 'ita-nld': {'num_samples': 9160, 'number_of_characters': 7462623, 'unique_pairs': 9160, 'min_sentence1_length': 64, 'average_sentence1_length': 413.37, 'max_sentence1_length': 16311, 'unique_sentence1': 9160, 'min_sentence2_length': 62, 'average_sentence2_length': 401.33, 'max_sentence2_length': 15855, 'unique_sentence2': 9160}, 'ita-nor': {'num_samples': 2516, 'number_of_characters': 1877016, 'unique_pairs': 2516, 'min_sentence1_length': 88, 'average_sentence1_length': 395.47, 'max_sentence1_length': 3906, 'unique_sentence1': 2516, 'min_sentence2_length': 80, 'average_sentence2_length': 350.57, 'max_sentence2_length': 3030, 'unique_sentence2': 2516}, 'ita-por': {'num_samples': 10924, 'number_of_characters': 8751330, 'unique_pairs': 10924, 'min_sentence1_length': 55, 'average_sentence1_length': 406.05, 'max_sentence1_length': 16311, 'unique_sentence1': 10924, 'min_sentence2_length': 55, 'average_sentence2_length': 395.06, 'max_sentence2_length': 16230, 'unique_sentence2': 10924}, 'ita-ron': {'num_samples': 3360, 'number_of_characters': 2417807, 'unique_pairs': 3360, 'min_sentence1_length': 63, 'average_sentence1_length': 360.05, 'max_sentence1_length': 6226, 'unique_sentence1': 3360, 'min_sentence2_length': 65, 'average_sentence2_length': 359.53, 'max_sentence2_length': 6571, 'unique_sentence2': 3360}, 'ita-spa': {'num_samples': 16534, 'number_of_characters': 14167503, 'unique_pairs': 16534, 'min_sentence1_length': 49, 'average_sentence1_length': 426.64, 'max_sentence1_length': 16311, 'unique_sentence1': 16534, 'min_sentence2_length': 50, 'average_sentence2_length': 430.23, 'max_sentence2_length': 16655, 'unique_sentence2': 16534}, 'ita-swe': {'num_samples': 4741, 'number_of_characters': 3557838, 'unique_pairs': 4741, 'min_sentence1_length': 64, 'average_sentence1_length': 394.97, 'max_sentence1_length': 16311, 'unique_sentence1': 4741, 'min_sentence2_length': 57, 'average_sentence2_length': 355.47, 'max_sentence2_length': 15020, 'unique_sentence2': 4741}, 'nld-nor': {'num_samples': 2664, 'number_of_characters': 1941142, 'unique_pairs': 2664, 'min_sentence1_length': 72, 'average_sentence1_length': 380.63, 'max_sentence1_length': 3967, 'unique_sentence1': 2664, 'min_sentence2_length': 68, 'average_sentence2_length': 348.03, 'max_sentence2_length': 3400, 'unique_sentence2': 2664}, 'nld-por': {'num_samples': 7021, 'number_of_characters': 5347190, 'unique_pairs': 7021, 'min_sentence1_length': 51, 'average_sentence1_length': 380.22, 'max_sentence1_length': 15855, 'unique_sentence1': 7021, 'min_sentence2_length': 56, 'average_sentence2_length': 381.38, 'max_sentence2_length': 16230, 'unique_sentence2': 7021}, 'nld-ron': {'num_samples': 2888, 'number_of_characters': 2001437, 'unique_pairs': 2888, 'min_sentence1_length': 60, 'average_sentence1_length': 340.69, 'max_sentence1_length': 4431, 'unique_sentence1': 2888, 'min_sentence2_length': 70, 'average_sentence2_length': 352.33, 'max_sentence2_length': 4285, 'unique_sentence2': 2888}, 'nld-spa': {'num_samples': 9555, 'number_of_characters': 7861048, 'unique_pairs': 9555, 'min_sentence1_length': 62, 'average_sentence1_length': 403.95, 'max_sentence1_length': 15855, 'unique_sentence1': 9555, 'min_sentence2_length': 55, 'average_sentence2_length': 418.76, 'max_sentence2_length': 16655, 'unique_sentence2': 9555}, 'nld-swe': {'num_samples': 5072, 'number_of_characters': 3727392, 'unique_pairs': 5072, 'min_sentence1_length': 65, 'average_sentence1_length': 381.69, 'max_sentence1_length': 15855, 'unique_sentence1': 5072, 'min_sentence2_length': 56, 'average_sentence2_length': 353.2, 'max_sentence2_length': 15020, 'unique_sentence2': 5072}, 'nor-por': {'num_samples': 2096, 'number_of_characters': 1461412, 'unique_pairs': 2096, 'min_sentence1_length': 68, 'average_sentence1_length': 331.47, 'max_sentence1_length': 3400, 'unique_sentence1': 2096, 'min_sentence2_length': 61, 'average_sentence2_length': 365.77, 'max_sentence2_length': 3784, 'unique_sentence2': 2096}, 'nor-ron': {'num_samples': 1412, 'number_of_characters': 972154, 'unique_pairs': 1412, 'min_sentence1_length': 78, 'average_sentence1_length': 324.01, 'max_sentence1_length': 1884, 'unique_sentence1': 1412, 'min_sentence2_length': 75, 'average_sentence2_length': 364.48, 'max_sentence2_length': 2196, 'unique_sentence2': 1412}, 'nor-spa': {'num_samples': 2603, 'number_of_characters': 1933198, 'unique_pairs': 2603, 'min_sentence1_length': 63, 'average_sentence1_length': 347.56, 'max_sentence1_length': 3030, 'unique_sentence1': 2603, 'min_sentence2_length': 74, 'average_sentence2_length': 395.12, 'max_sentence2_length': 3847, 'unique_sentence2': 2603}, 'nor-swe': {'num_samples': 3165, 'number_of_characters': 2305135, 'unique_pairs': 3165, 'min_sentence1_length': 66, 'average_sentence1_length': 360.99, 'max_sentence1_length': 2366, 'unique_sentence1': 3165, 'min_sentence2_length': 70, 'average_sentence2_length': 367.33, 'max_sentence2_length': 2340, 'unique_sentence2': 3165}, 'por-ron': {'num_samples': 3026, 'number_of_characters': 2086079, 'unique_pairs': 3026, 'min_sentence1_length': 71, 'average_sentence1_length': 340.88, 'max_sentence1_length': 4439, 'unique_sentence1': 3026, 'min_sentence2_length': 63, 'average_sentence2_length': 348.51, 'max_sentence2_length': 3274, 'unique_sentence2': 3026}, 'por-spa': {'num_samples': 16084, 'number_of_characters': 12835938, 'unique_pairs': 16084, 'min_sentence1_length': 51, 'average_sentence1_length': 391.86, 'max_sentence1_length': 16230, 'unique_sentence1': 16084, 'min_sentence2_length': 54, 'average_sentence2_length': 406.2, 'max_sentence2_length': 16655, 'unique_sentence2': 16084}, 'por-swe': {'num_samples': 4235, 'number_of_characters': 2994503, 'unique_pairs': 4235, 'min_sentence1_length': 62, 'average_sentence1_length': 367.66, 'max_sentence1_length': 16230, 'unique_sentence1': 4235, 'min_sentence2_length': 57, 'average_sentence2_length': 339.43, 'max_sentence2_length': 15020, 'unique_sentence2': 4235}, 'ron-spa': {'num_samples': 3375, 'number_of_characters': 2415347, 'unique_pairs': 3375, 'min_sentence1_length': 73, 'average_sentence1_length': 355.39, 'max_sentence1_length': 4285, 'unique_sentence1': 3375, 'min_sentence2_length': 70, 'average_sentence2_length': 360.27, 'max_sentence2_length': 4498, 'unique_sentence2': 3375}, 'ron-swe': {'num_samples': 2154, 'number_of_characters': 1454257, 'unique_pairs': 2154, 'min_sentence1_length': 63, 'average_sentence1_length': 354.66, 'max_sentence1_length': 4285, 'unique_sentence1': 2154, 'min_sentence2_length': 63, 'average_sentence2_length': 320.48, 'max_sentence2_length': 4038, 'unique_sentence2': 2154}, 'spa-swe': {'num_samples': 4884, 'number_of_characters': 3751782, 'unique_pairs': 4884, 'min_sentence1_length': 66, 'average_sentence1_length': 406.52, 'max_sentence1_length': 16655, 'unique_sentence1': 4884, 'min_sentence2_length': 62, 'average_sentence2_length': 361.66, 'max_sentence2_length': 15020, 'unique_sentence2': 4884}, 'ben-hin': {'num_samples': 1174, 'number_of_characters': 682915, 'unique_pairs': 1174, 'min_sentence1_length': 64, 'average_sentence1_length': 287.33, 'max_sentence1_length': 1957, 'unique_sentence1': 1174, 'min_sentence2_length': 50, 'average_sentence2_length': 294.37, 'max_sentence2_length': 1980, 'unique_sentence2': 1174}, 'ben-mar': {'num_samples': 566, 'number_of_characters': 305353, 'unique_pairs': 566, 'min_sentence1_length': 50, 'average_sentence1_length': 271.83, 'max_sentence1_length': 1753, 'unique_sentence1': 566, 'min_sentence2_length': 57, 'average_sentence2_length': 267.66, 'max_sentence2_length': 1780, 'unique_sentence2': 566}, 'ben-urd': {'num_samples': 488, 'number_of_characters': 265698, 'unique_pairs': 488, 'min_sentence1_length': 61, 'average_sentence1_length': 269.23, 'max_sentence1_length': 1190, 'unique_sentence1': 488, 'min_sentence2_length': 62, 'average_sentence2_length': 275.23, 'max_sentence2_length': 1179, 'unique_sentence2': 488}, 'hin-mar': {'num_samples': 615, 'number_of_characters': 320880, 'unique_pairs': 615, 'min_sentence1_length': 58, 'average_sentence1_length': 265.89, 'max_sentence1_length': 1769, 'unique_sentence1': 615, 'min_sentence2_length': 58, 'average_sentence2_length': 255.87, 'max_sentence2_length': 1780, 'unique_sentence2': 615}, 'hin-urd': {'num_samples': 545, 'number_of_characters': 293939, 'unique_pairs': 545, 'min_sentence1_length': 63, 'average_sentence1_length': 271.89, 'max_sentence1_length': 1206, 'unique_sentence1': 545, 'min_sentence2_length': 62, 'average_sentence2_length': 267.45, 'max_sentence2_length': 1179, 'unique_sentence2': 545}, 'mar-urd': {'num_samples': 270, 'number_of_characters': 147706, 'unique_pairs': 270, 'min_sentence1_length': 63, 'average_sentence1_length': 270.9, 'max_sentence1_length': 1169, 'unique_sentence1': 270, 'min_sentence2_length': 66, 'average_sentence2_length': 276.16, 'max_sentence2_length': 1172, 'unique_sentence2': 270}, 'aze-kaz': {'num_samples': 412, 'number_of_characters': 230950, 'unique_pairs': 412, 'min_sentence1_length': 73, 'average_sentence1_length': 280.5, 'max_sentence1_length': 1824, 'unique_sentence1': 412, 'min_sentence2_length': 68, 'average_sentence2_length': 280.06, 'max_sentence2_length': 1855, 'unique_sentence2': 412}, 'aze-tur': {'num_samples': 388, 'number_of_characters': 205998, 'unique_pairs': 388, 'min_sentence1_length': 72, 'average_sentence1_length': 266.67, 'max_sentence1_length': 1824, 'unique_sentence1': 388, 'min_sentence2_length': 64, 'average_sentence2_length': 264.26, 'max_sentence2_length': 1838, 'unique_sentence2': 388}, 'kaz-tur': {'num_samples': 340, 'number_of_characters': 181572, 'unique_pairs': 340, 'min_sentence1_length': 68, 'average_sentence1_length': 267.44, 'max_sentence1_length': 1855, 'unique_sentence1': 340, 'min_sentence2_length': 64, 'average_sentence2_length': 266.59, 'max_sentence2_length': 1838, 'unique_sentence2': 340}, 'est-fin': {'num_samples': 790, 'number_of_characters': 551725, 'unique_pairs': 790, 'min_sentence1_length': 63, 'average_sentence1_length': 341.07, 'max_sentence1_length': 1829, 'unique_sentence1': 790, 'min_sentence2_length': 63, 'average_sentence2_length': 357.32, 'max_sentence2_length': 1815, 'unique_sentence2': 790}, 'est-hun': {'num_samples': 674, 'number_of_characters': 465779, 'unique_pairs': 674, 'min_sentence1_length': 72, 'average_sentence1_length': 329.88, 'max_sentence1_length': 2117, 'unique_sentence1': 674, 'min_sentence2_length': 73, 'average_sentence2_length': 361.18, 'max_sentence2_length': 2403, 'unique_sentence2': 674}, 'fin-hun': {'num_samples': 1542, 'number_of_characters': 1062678, 'unique_pairs': 1542, 'min_sentence1_length': 65, 'average_sentence1_length': 336.81, 'max_sentence1_length': 2500, 'unique_sentence1': 1542, 'min_sentence2_length': 66, 'average_sentence2_length': 352.35, 'max_sentence2_length': 2674, 'unique_sentence2': 1542}, 'ara-eng': {'num_samples': 5698, 'number_of_characters': 3662324, 'unique_pairs': 5698, 'min_sentence1_length': 53, 'average_sentence1_length': 302.59, 'max_sentence1_length': 2664, 'unique_sentence1': 5698, 'min_sentence2_length': 60, 'average_sentence2_length': 340.15, 'max_sentence2_length': 2811, 'unique_sentence2': 5698}, 'aze-eng': {'num_samples': 603, 'number_of_characters': 383857, 'unique_pairs': 603, 'min_sentence1_length': 58, 'average_sentence1_length': 323.52, 'max_sentence1_length': 1339, 'unique_sentence1': 603, 'min_sentence2_length': 69, 'average_sentence2_length': 313.05, 'max_sentence2_length': 1352, 'unique_sentence2': 603}, 'ben-eng': {'num_samples': 1367, 'number_of_characters': 866437, 'unique_pairs': 1367, 'min_sentence1_length': 50, 'average_sentence1_length': 318.67, 'max_sentence1_length': 1191, 'unique_sentence1': 1367, 'min_sentence2_length': 65, 'average_sentence2_length': 315.16, 'max_sentence2_length': 1094, 'unique_sentence2': 1367}, 'bul-eng': {'num_samples': 2133, 'number_of_characters': 1458078, 'unique_pairs': 2133, 'min_sentence1_length': 65, 'average_sentence1_length': 354.96, 'max_sentence1_length': 3016, 'unique_sentence1': 2133, 'min_sentence2_length': 65, 'average_sentence2_length': 328.62, 'max_sentence2_length': 2770, 'unique_sentence2': 2133}, 'cat-eng': {'num_samples': 1152, 'number_of_characters': 977495, 'unique_pairs': 1152, 'min_sentence1_length': 64, 'average_sentence1_length': 437.25, 'max_sentence1_length': 8113, 'unique_sentence1': 1152, 'min_sentence2_length': 59, 'average_sentence2_length': 411.27, 'max_sentence2_length': 7400, 'unique_sentence2': 1152}, 'ces-eng': {'num_samples': 3775, 'number_of_characters': 2651016, 'unique_pairs': 3775, 'min_sentence1_length': 54, 'average_sentence1_length': 347.59, 'max_sentence1_length': 2349, 'unique_sentence1': 3775, 'min_sentence2_length': 60, 'average_sentence2_length': 354.66, 'max_sentence2_length': 2401, 'unique_sentence2': 3775}, 'dan-eng': {'num_samples': 4512, 'number_of_characters': 3404958, 'unique_pairs': 4512, 'min_sentence1_length': 49, 'average_sentence1_length': 381.45, 'max_sentence1_length': 15317, 'unique_sentence1': 4512, 'min_sentence2_length': 56, 'average_sentence2_length': 373.19, 'max_sentence2_length': 14749, 'unique_sentence2': 4512}, 'deu-eng': {'num_samples': 37348, 'number_of_characters': 29811894, 'unique_pairs': 37348, 'min_sentence1_length': 53, 'average_sentence1_length': 423.63, 'max_sentence1_length': 6437, 'unique_sentence1': 37348, 'min_sentence2_length': 46, 'average_sentence2_length': 374.59, 'max_sentence2_length': 5781, 'unique_sentence2': 37348}, 'ell-eng': {'num_samples': 2790, 'number_of_characters': 2021300, 'unique_pairs': 2790, 'min_sentence1_length': 66, 'average_sentence1_length': 394.65, 'max_sentence1_length': 2963, 'unique_sentence1': 2790, 'min_sentence2_length': 61, 'average_sentence2_length': 329.83, 'max_sentence2_length': 3013, 'unique_sentence2': 2790}, 'eng-est': {'num_samples': 755, 'number_of_characters': 528417, 'unique_pairs': 755, 'min_sentence1_length': 58, 'average_sentence1_length': 352.58, 'max_sentence1_length': 1567, 'unique_sentence1': 755, 'min_sentence2_length': 62, 'average_sentence2_length': 347.31, 'max_sentence2_length': 1630, 'unique_sentence2': 755}, 'eng-fas': {'num_samples': 556, 'number_of_characters': 431685, 'unique_pairs': 556, 'min_sentence1_length': 59, 'average_sentence1_length': 396.65, 'max_sentence1_length': 5339, 'unique_sentence1': 556, 'min_sentence2_length': 68, 'average_sentence2_length': 379.76, 'max_sentence2_length': 4782, 'unique_sentence2': 556}, 'eng-fin': {'num_samples': 3443, 'number_of_characters': 2505517, 'unique_pairs': 3443, 'min_sentence1_length': 62, 'average_sentence1_length': 359.76, 'max_sentence1_length': 4412, 'unique_sentence1': 3443, 'min_sentence2_length': 57, 'average_sentence2_length': 367.96, 'max_sentence2_length': 4583, 'unique_sentence2': 3443}, 'eng-fra': {'num_samples': 37208, 'number_of_characters': 30609932, 'unique_pairs': 37208, 'min_sentence1_length': 52, 'average_sentence1_length': 375.68, 'max_sentence1_length': 14463, 'unique_sentence1': 37208, 'min_sentence2_length': 59, 'average_sentence2_length': 446.99, 'max_sentence2_length': 15312, 'unique_sentence2': 37208}, 'eng-heb': {'num_samples': 882, 'number_of_characters': 541517, 'unique_pairs': 882, 'min_sentence1_length': 80, 'average_sentence1_length': 339.58, 'max_sentence1_length': 16651, 'unique_sentence1': 882, 'min_sentence2_length': 62, 'average_sentence2_length': 274.39, 'max_sentence2_length': 14483, 'unique_sentence2': 882}, 'eng-hin': {'num_samples': 2219, 'number_of_characters': 1277126, 'unique_pairs': 2219, 'min_sentence1_length': 59, 'average_sentence1_length': 284.61, 'max_sentence1_length': 2439, 'unique_sentence1': 2219, 'min_sentence2_length': 50, 'average_sentence2_length': 290.93, 'max_sentence2_length': 2496, 'unique_sentence2': 2219}, 'eng-hrv': {'num_samples': 336, 'number_of_characters': 247780, 'unique_pairs': 336, 'min_sentence1_length': 79, 'average_sentence1_length': 370.68, 'max_sentence1_length': 1657, 'unique_sentence1': 336, 'min_sentence2_length': 59, 'average_sentence2_length': 366.76, 'max_sentence2_length': 1393, 'unique_sentence2': 336}, 'eng-hun': {'num_samples': 2185, 'number_of_characters': 1517498, 'unique_pairs': 2185, 'min_sentence1_length': 61, 'average_sentence1_length': 334.86, 'max_sentence1_length': 1664, 'unique_sentence1': 2185, 'min_sentence2_length': 55, 'average_sentence2_length': 359.65, 'max_sentence2_length': 1814, 'unique_sentence2': 2185}, 'eng-ind': {'num_samples': 3454, 'number_of_characters': 2493124, 'unique_pairs': 3454, 'min_sentence1_length': 60, 'average_sentence1_length': 344.73, 'max_sentence1_length': 4253, 'unique_sentence1': 3454, 'min_sentence2_length': 67, 'average_sentence2_length': 377.07, 'max_sentence2_length': 4132, 'unique_sentence2': 3454}, 'eng-isl': {'num_samples': 358, 'number_of_characters': 237004, 'unique_pairs': 358, 'min_sentence1_length': 72, 'average_sentence1_length': 329.61, 'max_sentence1_length': 1112, 'unique_sentence1': 358, 'min_sentence2_length': 69, 'average_sentence2_length': 332.41, 'max_sentence2_length': 1206, 'unique_sentence2': 358}, 'eng-ita': {'num_samples': 19661, 'number_of_characters': 15893506, 'unique_pairs': 19661, 'min_sentence1_length': 54, 'average_sentence1_length': 381.62, 'max_sentence1_length': 14540, 'unique_sentence1': 19661, 'min_sentence2_length': 49, 'average_sentence2_length': 426.75, 'max_sentence2_length': 16311, 'unique_sentence2': 19661}, 'eng-jpn': {'num_samples': 3807, 'number_of_characters': 1890484, 'unique_pairs': 3807, 'min_sentence1_length': 66, 'average_sentence1_length': 323.35, 'max_sentence1_length': 1738, 'unique_sentence1': 3807, 'min_sentence2_length': 47, 'average_sentence2_length': 173.23, 'max_sentence2_length': 856, 'unique_sentence2': 3807}, 'eng-kaz': {'num_samples': 346, 'number_of_characters': 245791, 'unique_pairs': 346, 'min_sentence1_length': 72, 'average_sentence1_length': 349.87, 'max_sentence1_length': 2126, 'unique_sentence1': 346, 'min_sentence2_length': 79, 'average_sentence2_length': 360.51, 'max_sentence2_length': 2052, 'unique_sentence2': 346}, 'eng-kor': {'num_samples': 2558, 'number_of_characters': 1358932, 'unique_pairs': 2558, 'min_sentence1_length': 58, 'average_sentence1_length': 342.45, 'max_sentence1_length': 11054, 'unique_sentence1': 2558, 'min_sentence2_length': 47, 'average_sentence2_length': 188.79, 'max_sentence2_length': 6212, 'unique_sentence2': 2558}, 'eng-lav': {'num_samples': 1079, 'number_of_characters': 765748, 'unique_pairs': 1079, 'min_sentence1_length': 63, 'average_sentence1_length': 352.18, 'max_sentence1_length': 3960, 'unique_sentence1': 1079, 'min_sentence2_length': 65, 'average_sentence2_length': 357.5, 'max_sentence2_length': 3996, 'unique_sentence2': 1079}, 'eng-lit': {'num_samples': 1185, 'number_of_characters': 884963, 'unique_pairs': 1185, 'min_sentence1_length': 70, 'average_sentence1_length': 369.4, 'max_sentence1_length': 3955, 'unique_sentence1': 1185, 'min_sentence2_length': 87, 'average_sentence2_length': 377.4, 'max_sentence2_length': 3841, 'unique_sentence2': 1185}, 'eng-mar': {'num_samples': 280, 'number_of_characters': 153207, 'unique_pairs': 280, 'min_sentence1_length': 65, 'average_sentence1_length': 274.54, 'max_sentence1_length': 883, 'unique_sentence1': 280, 'min_sentence2_length': 57, 'average_sentence2_length': 272.62, 'max_sentence2_length': 943, 'unique_sentence2': 280}, 'eng-msa': {'num_samples': 469, 'number_of_characters': 205989, 'unique_pairs': 469, 'min_sentence1_length': 58, 'average_sentence1_length': 209.64, 'max_sentence1_length': 1842, 'unique_sentence1': 469, 'min_sentence2_length': 58, 'average_sentence2_length': 229.57, 'max_sentence2_length': 2082, 'unique_sentence2': 469}, 'eng-nld': {'num_samples': 15613, 'number_of_characters': 12313146, 'unique_pairs': 15613, 'min_sentence1_length': 55, 'average_sentence1_length': 378.03, 'max_sentence1_length': 15297, 'unique_sentence1': 15613, 'min_sentence2_length': 54, 'average_sentence2_length': 410.62, 'max_sentence2_length': 16485, 'unique_sentence2': 15613}, 'eng-nor': {'num_samples': 2666, 'number_of_characters': 1883809, 'unique_pairs': 2666, 'min_sentence1_length': 55, 'average_sentence1_length': 353.08, 'max_sentence1_length': 2834, 'unique_sentence1': 2666, 'min_sentence2_length': 56, 'average_sentence2_length': 353.53, 'max_sentence2_length': 2795, 'unique_sentence2': 2666}, 'eng-pol': {'num_samples': 6868, 'number_of_characters': 4946440, 'unique_pairs': 6868, 'min_sentence1_length': 54, 'average_sentence1_length': 349.29, 'max_sentence1_length': 4412, 'unique_sentence1': 6868, 'min_sentence2_length': 57, 'average_sentence2_length': 370.92, 'max_sentence2_length': 5103, 'unique_sentence2': 6868}, 'eng-por': {'num_samples': 12406, 'number_of_characters': 9040635, 'unique_pairs': 12406, 'min_sentence1_length': 58, 'average_sentence1_length': 347.94, 'max_sentence1_length': 6643, 'unique_sentence1': 12406, 'min_sentence2_length': 56, 'average_sentence2_length': 380.79, 'max_sentence2_length': 7445, 'unique_sentence2': 12406}, 'eng-ron': {'num_samples': 3039, 'number_of_characters': 2119434, 'unique_pairs': 3039, 'min_sentence1_length': 61, 'average_sentence1_length': 328.93, 'max_sentence1_length': 2085, 'unique_sentence1': 3039, 'min_sentence2_length': 70, 'average_sentence2_length': 368.48, 'max_sentence2_length': 2421, 'unique_sentence2': 3039}, 'eng-rus': {'num_samples': 9360, 'number_of_characters': 6547558, 'unique_pairs': 9360, 'min_sentence1_length': 54, 'average_sentence1_length': 340.47, 'max_sentence1_length': 3382, 'unique_sentence1': 9360, 'min_sentence2_length': 56, 'average_sentence2_length': 359.06, 'max_sentence2_length': 4018, 'unique_sentence2': 9360}, 'eng-slk': {'num_samples': 1823, 'number_of_characters': 1287409, 'unique_pairs': 1823, 'min_sentence1_length': 64, 'average_sentence1_length': 353.21, 'max_sentence1_length': 4412, 'unique_sentence1': 1823, 'min_sentence2_length': 68, 'average_sentence2_length': 352.99, 'max_sentence2_length': 4641, 'unique_sentence2': 1823}, 'eng-slv': {'num_samples': 1450, 'number_of_characters': 948874, 'unique_pairs': 1450, 'min_sentence1_length': 61, 'average_sentence1_length': 327.63, 'max_sentence1_length': 2058, 'unique_sentence1': 1450, 'min_sentence2_length': 59, 'average_sentence2_length': 326.77, 'max_sentence2_length': 2049, 'unique_sentence2': 1450}, 'eng-spa': {'num_samples': 35446, 'number_of_characters': 29514190, 'unique_pairs': 35446, 'min_sentence1_length': 52, 'average_sentence1_length': 391.45, 'max_sentence1_length': 16651, 'unique_sentence1': 35446, 'min_sentence2_length': 47, 'average_sentence2_length': 441.2, 'max_sentence2_length': 21081, 'unique_sentence2': 35446}, 'eng-srp': {'num_samples': 303, 'number_of_characters': 196853, 'unique_pairs': 303, 'min_sentence1_length': 85, 'average_sentence1_length': 323.19, 'max_sentence1_length': 1054, 'unique_sentence1': 303, 'min_sentence2_length': 78, 'average_sentence2_length': 326.49, 'max_sentence2_length': 1025, 'unique_sentence2': 303}, 'eng-swe': {'num_samples': 6005, 'number_of_characters': 4500597, 'unique_pairs': 6005, 'min_sentence1_length': 54, 'average_sentence1_length': 373.82, 'max_sentence1_length': 14540, 'unique_sentence1': 6005, 'min_sentence2_length': 56, 'average_sentence2_length': 375.65, 'max_sentence2_length': 15020, 'unique_sentence2': 6005}, 'eng-tgl': {'num_samples': 551, 'number_of_characters': 426303, 'unique_pairs': 551, 'min_sentence1_length': 61, 'average_sentence1_length': 346.09, 'max_sentence1_length': 1958, 'unique_sentence1': 551, 'min_sentence2_length': 57, 'average_sentence2_length': 427.6, 'max_sentence2_length': 2422, 'unique_sentence2': 551}, 'eng-tha': {'num_samples': 814, 'number_of_characters': 426960, 'unique_pairs': 814, 'min_sentence1_length': 67, 'average_sentence1_length': 274.76, 'max_sentence1_length': 1842, 'unique_sentence1': 814, 'min_sentence2_length': 52, 'average_sentence2_length': 249.76, 'max_sentence2_length': 1669, 'unique_sentence2': 814}, 'eng-tur': {'num_samples': 4606, 'number_of_characters': 3315390, 'unique_pairs': 4606, 'min_sentence1_length': 58, 'average_sentence1_length': 353.38, 'max_sentence1_length': 5595, 'unique_sentence1': 4606, 'min_sentence2_length': 58, 'average_sentence2_length': 366.42, 'max_sentence2_length': 6024, 'unique_sentence2': 4606}, 'eng-ukr': {'num_samples': 3778, 'number_of_characters': 2803318, 'unique_pairs': 3778, 'min_sentence1_length': 59, 'average_sentence1_length': 367.93, 'max_sentence1_length': 3627, 'unique_sentence1': 3778, 'min_sentence2_length': 49, 'average_sentence2_length': 374.08, 'max_sentence2_length': 3991, 'unique_sentence2': 3778}, 'eng-urd': {'num_samples': 268, 'number_of_characters': 137458, 'unique_pairs': 268, 'min_sentence1_length': 67, 'average_sentence1_length': 253.83, 'max_sentence1_length': 736, 'unique_sentence1': 268, 'min_sentence2_length': 57, 'average_sentence2_length': 259.07, 'max_sentence2_length': 795, 'unique_sentence2': 268}, 'eng-vie': {'num_samples': 1264, 'number_of_characters': 866332, 'unique_pairs': 1264, 'min_sentence1_length': 60, 'average_sentence1_length': 330.71, 'max_sentence1_length': 4253, 'unique_sentence1': 1264, 'min_sentence2_length': 59, 'average_sentence2_length': 354.68, 'max_sentence2_length': 3780, 'unique_sentence2': 1264}, 'eng-zho': {'num_samples': 4959, 'number_of_characters': 2696879, 'unique_pairs': 4959, 'min_sentence1_length': 75, 'average_sentence1_length': 402.38, 'max_sentence1_length': 14540, 'unique_sentence1': 4959, 'min_sentence2_length': 41, 'average_sentence2_length': 141.46, 'max_sentence2_length': 4500, 'unique_sentence2': 4959}}}} | | [WebFAQBitextMiningQuestions](https://huggingface.co/PaDaS-Lab) (Michael Dinzinger, 2025) | ['ara', 'aze', 'ben', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fas', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'isl', 'ita', 'jpn', 'kat', 'kaz', 'kor', 'lav', 'lit', 'mar', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'srp', 'swe', 'tgl', 'tha', 'tur', 'ukr', 'urd', 'vie', 'zho'] | BitextMining | s2s | [Web, Written] | {'default': 682057} | {'default': {'num_samples': 682057, 'number_of_characters': 71984237, 'unique_pairs': 681597, 'min_sentence1_length': 6, 'average_sentence1_length': 52.72, 'max_sentence1_length': 847, 'unique_sentence1': 379086, 'min_sentence2_length': 6, 'average_sentence2_length': 52.82, 'max_sentence2_length': 773, 'unique_sentence2': 398743, 'hf_subset_descriptive_stats': {'ara-fas': {'num_samples': 609, 'number_of_characters': 53319, 'unique_pairs': 609, 'min_sentence1_length': 10, 'average_sentence1_length': 41.9, 'max_sentence1_length': 164, 'unique_sentence1': 609, 'min_sentence2_length': 11, 'average_sentence2_length': 45.65, 'max_sentence2_length': 177, 'unique_sentence2': 609}, 'ara-heb': {'num_samples': 978, 'number_of_characters': 93188, 'unique_pairs': 978, 'min_sentence1_length': 10, 'average_sentence1_length': 48.34, 'max_sentence1_length': 158, 'unique_sentence1': 978, 'min_sentence2_length': 9, 'average_sentence2_length': 46.95, 'max_sentence2_length': 160, 'unique_sentence2': 978}, 'jpn-kor': {'num_samples': 4820, 'number_of_characters': 310427, 'unique_pairs': 4820, 'min_sentence1_length': 10, 'average_sentence1_length': 31.91, 'max_sentence1_length': 146, 'unique_sentence1': 4820, 'min_sentence2_length': 7, 'average_sentence2_length': 32.5, 'max_sentence2_length': 208, 'unique_sentence2': 4820}, 'jpn-vie': {'num_samples': 1356, 'number_of_characters': 110198, 'unique_pairs': 1356, 'min_sentence1_length': 10, 'average_sentence1_length': 28.55, 'max_sentence1_length': 152, 'unique_sentence1': 1356, 'min_sentence2_length': 10, 'average_sentence2_length': 52.72, 'max_sentence2_length': 344, 'unique_sentence2': 1356}, 'jpn-zho': {'num_samples': 1728, 'number_of_characters': 86936, 'unique_pairs': 1728, 'min_sentence1_length': 7, 'average_sentence1_length': 29.03, 'max_sentence1_length': 135, 'unique_sentence1': 1728, 'min_sentence2_length': 8, 'average_sentence2_length': 21.28, 'max_sentence2_length': 126, 'unique_sentence2': 1728}, 'kor-vie': {'num_samples': 1386, 'number_of_characters': 116716, 'unique_pairs': 1386, 'min_sentence1_length': 10, 'average_sentence1_length': 30.34, 'max_sentence1_length': 108, 'unique_sentence1': 1386, 'min_sentence2_length': 10, 'average_sentence2_length': 53.87, 'max_sentence2_length': 176, 'unique_sentence2': 1386}, 'kor-zho': {'num_samples': 1087, 'number_of_characters': 56152, 'unique_pairs': 1087, 'min_sentence1_length': 10, 'average_sentence1_length': 30.24, 'max_sentence1_length': 109, 'unique_sentence1': 1087, 'min_sentence2_length': 10, 'average_sentence2_length': 21.42, 'max_sentence2_length': 95, 'unique_sentence2': 1087}, 'vie-zho': {'num_samples': 646, 'number_of_characters': 46981, 'unique_pairs': 646, 'min_sentence1_length': 11, 'average_sentence1_length': 52.13, 'max_sentence1_length': 258, 'unique_sentence1': 646, 'min_sentence2_length': 10, 'average_sentence2_length': 20.6, 'max_sentence2_length': 78, 'unique_sentence2': 646}, 'ind-msa': {'num_samples': 455, 'number_of_characters': 61802, 'unique_pairs': 455, 'min_sentence1_length': 17, 'average_sentence1_length': 69.13, 'max_sentence1_length': 135, 'unique_sentence1': 455, 'min_sentence2_length': 16, 'average_sentence2_length': 66.7, 'max_sentence2_length': 156, 'unique_sentence2': 455}, 'ind-tgl': {'num_samples': 378, 'number_of_characters': 40452, 'unique_pairs': 378, 'min_sentence1_length': 13, 'average_sentence1_length': 51.3, 'max_sentence1_length': 122, 'unique_sentence1': 378, 'min_sentence2_length': 13, 'average_sentence2_length': 55.71, 'max_sentence2_length': 136, 'unique_sentence2': 378}, 'ind-tha': {'num_samples': 1258, 'number_of_characters': 129949, 'unique_pairs': 1258, 'min_sentence1_length': 11, 'average_sentence1_length': 56.66, 'max_sentence1_length': 159, 'unique_sentence1': 1258, 'min_sentence2_length': 11, 'average_sentence2_length': 46.64, 'max_sentence2_length': 153, 'unique_sentence2': 1258}, 'bul-ces': {'num_samples': 1485, 'number_of_characters': 156647, 'unique_pairs': 1485, 'min_sentence1_length': 12, 'average_sentence1_length': 55.73, 'max_sentence1_length': 273, 'unique_sentence1': 1485, 'min_sentence2_length': 10, 'average_sentence2_length': 49.75, 'max_sentence2_length': 265, 'unique_sentence2': 1485}, 'bul-lav': {'num_samples': 710, 'number_of_characters': 72767, 'unique_pairs': 710, 'min_sentence1_length': 14, 'average_sentence1_length': 52.81, 'max_sentence1_length': 169, 'unique_sentence1': 710, 'min_sentence2_length': 13, 'average_sentence2_length': 49.68, 'max_sentence2_length': 137, 'unique_sentence2': 710}, 'bul-lit': {'num_samples': 803, 'number_of_characters': 83819, 'unique_pairs': 803, 'min_sentence1_length': 14, 'average_sentence1_length': 53.81, 'max_sentence1_length': 223, 'unique_sentence1': 803, 'min_sentence2_length': 14, 'average_sentence2_length': 50.57, 'max_sentence2_length': 188, 'unique_sentence2': 803}, 'bul-pol': {'num_samples': 1635, 'number_of_characters': 182803, 'unique_pairs': 1635, 'min_sentence1_length': 12, 'average_sentence1_length': 56.71, 'max_sentence1_length': 224, 'unique_sentence1': 1635, 'min_sentence2_length': 14, 'average_sentence2_length': 55.1, 'max_sentence2_length': 223, 'unique_sentence2': 1635}, 'bul-rus': {'num_samples': 1476, 'number_of_characters': 171755, 'unique_pairs': 1476, 'min_sentence1_length': 11, 'average_sentence1_length': 58.61, 'max_sentence1_length': 273, 'unique_sentence1': 1476, 'min_sentence2_length': 13, 'average_sentence2_length': 57.75, 'max_sentence2_length': 263, 'unique_sentence2': 1476}, 'bul-slk': {'num_samples': 1154, 'number_of_characters': 122062, 'unique_pairs': 1154, 'min_sentence1_length': 14, 'average_sentence1_length': 55.47, 'max_sentence1_length': 273, 'unique_sentence1': 1154, 'min_sentence2_length': 12, 'average_sentence2_length': 50.31, 'max_sentence2_length': 234, 'unique_sentence2': 1154}, 'bul-slv': {'num_samples': 1034, 'number_of_characters': 120376, 'unique_pairs': 1034, 'min_sentence1_length': 13, 'average_sentence1_length': 60.39, 'max_sentence1_length': 273, 'unique_sentence1': 1034, 'min_sentence2_length': 12, 'average_sentence2_length': 56.03, 'max_sentence2_length': 252, 'unique_sentence2': 1034}, 'bul-srp': {'num_samples': 296, 'number_of_characters': 29879, 'unique_pairs': 296, 'min_sentence1_length': 17, 'average_sentence1_length': 51.28, 'max_sentence1_length': 146, 'unique_sentence1': 296, 'min_sentence2_length': 10, 'average_sentence2_length': 49.66, 'max_sentence2_length': 118, 'unique_sentence2': 296}, 'bul-ukr': {'num_samples': 1074, 'number_of_characters': 120520, 'unique_pairs': 1074, 'min_sentence1_length': 11, 'average_sentence1_length': 57.26, 'max_sentence1_length': 155, 'unique_sentence1': 1074, 'min_sentence2_length': 12, 'average_sentence2_length': 54.96, 'max_sentence2_length': 152, 'unique_sentence2': 1074}, 'ces-lav': {'num_samples': 875, 'number_of_characters': 81949, 'unique_pairs': 875, 'min_sentence1_length': 12, 'average_sentence1_length': 45.45, 'max_sentence1_length': 133, 'unique_sentence1': 875, 'min_sentence2_length': 13, 'average_sentence2_length': 48.2, 'max_sentence2_length': 151, 'unique_sentence2': 875}, 'ces-lit': {'num_samples': 1002, 'number_of_characters': 95191, 'unique_pairs': 1002, 'min_sentence1_length': 10, 'average_sentence1_length': 45.99, 'max_sentence1_length': 129, 'unique_sentence1': 1002, 'min_sentence2_length': 12, 'average_sentence2_length': 49.01, 'max_sentence2_length': 177, 'unique_sentence2': 1002}, 'ces-pol': {'num_samples': 3367, 'number_of_characters': 335068, 'unique_pairs': 3367, 'min_sentence1_length': 10, 'average_sentence1_length': 47.42, 'max_sentence1_length': 211, 'unique_sentence1': 3367, 'min_sentence2_length': 13, 'average_sentence2_length': 52.09, 'max_sentence2_length': 231, 'unique_sentence2': 3367}, 'ces-rus': {'num_samples': 2144, 'number_of_characters': 221574, 'unique_pairs': 2144, 'min_sentence1_length': 10, 'average_sentence1_length': 48.92, 'max_sentence1_length': 265, 'unique_sentence1': 2144, 'min_sentence2_length': 13, 'average_sentence2_length': 54.42, 'max_sentence2_length': 263, 'unique_sentence2': 2144}, 'ces-slk': {'num_samples': 2551, 'number_of_characters': 253025, 'unique_pairs': 2551, 'min_sentence1_length': 10, 'average_sentence1_length': 48.91, 'max_sentence1_length': 265, 'unique_sentence1': 2551, 'min_sentence2_length': 10, 'average_sentence2_length': 50.28, 'max_sentence2_length': 237, 'unique_sentence2': 2551}, 'ces-slv': {'num_samples': 1370, 'number_of_characters': 145413, 'unique_pairs': 1370, 'min_sentence1_length': 10, 'average_sentence1_length': 51.01, 'max_sentence1_length': 265, 'unique_sentence1': 1370, 'min_sentence2_length': 11, 'average_sentence2_length': 55.13, 'max_sentence2_length': 252, 'unique_sentence2': 1370}, 'ces-srp': {'num_samples': 362, 'number_of_characters': 37517, 'unique_pairs': 362, 'min_sentence1_length': 15, 'average_sentence1_length': 49.99, 'max_sentence1_length': 129, 'unique_sentence1': 362, 'min_sentence2_length': 16, 'average_sentence2_length': 53.65, 'max_sentence2_length': 129, 'unique_sentence2': 362}, 'ces-ukr': {'num_samples': 1285, 'number_of_characters': 133762, 'unique_pairs': 1285, 'min_sentence1_length': 10, 'average_sentence1_length': 49.67, 'max_sentence1_length': 233, 'unique_sentence1': 1285, 'min_sentence2_length': 12, 'average_sentence2_length': 54.42, 'max_sentence2_length': 253, 'unique_sentence2': 1285}, 'hrv-slk': {'num_samples': 313, 'number_of_characters': 38510, 'unique_pairs': 313, 'min_sentence1_length': 16, 'average_sentence1_length': 61.02, 'max_sentence1_length': 174, 'unique_sentence1': 313, 'min_sentence2_length': 15, 'average_sentence2_length': 62.02, 'max_sentence2_length': 182, 'unique_sentence2': 313}, 'kat-rus': {'num_samples': 262, 'number_of_characters': 26840, 'unique_pairs': 262, 'min_sentence1_length': 14, 'average_sentence1_length': 51.6, 'max_sentence1_length': 179, 'unique_sentence1': 262, 'min_sentence2_length': 16, 'average_sentence2_length': 50.85, 'max_sentence2_length': 186, 'unique_sentence2': 262}, 'lav-lit': {'num_samples': 1061, 'number_of_characters': 101845, 'unique_pairs': 1061, 'min_sentence1_length': 11, 'average_sentence1_length': 47.87, 'max_sentence1_length': 283, 'unique_sentence1': 1061, 'min_sentence2_length': 12, 'average_sentence2_length': 48.12, 'max_sentence2_length': 268, 'unique_sentence2': 1061}, 'lav-pol': {'num_samples': 951, 'number_of_characters': 94163, 'unique_pairs': 951, 'min_sentence1_length': 11, 'average_sentence1_length': 48.29, 'max_sentence1_length': 172, 'unique_sentence1': 951, 'min_sentence2_length': 14, 'average_sentence2_length': 50.72, 'max_sentence2_length': 208, 'unique_sentence2': 951}, 'lav-rus': {'num_samples': 1412, 'number_of_characters': 141609, 'unique_pairs': 1412, 'min_sentence1_length': 11, 'average_sentence1_length': 49.2, 'max_sentence1_length': 146, 'unique_sentence1': 1412, 'min_sentence2_length': 13, 'average_sentence2_length': 51.09, 'max_sentence2_length': 182, 'unique_sentence2': 1412}, 'lav-slk': {'num_samples': 789, 'number_of_characters': 72654, 'unique_pairs': 789, 'min_sentence1_length': 13, 'average_sentence1_length': 46.75, 'max_sentence1_length': 172, 'unique_sentence1': 789, 'min_sentence2_length': 12, 'average_sentence2_length': 45.33, 'max_sentence2_length': 162, 'unique_sentence2': 789}, 'lav-slv': {'num_samples': 518, 'number_of_characters': 48873, 'unique_pairs': 518, 'min_sentence1_length': 13, 'average_sentence1_length': 46.73, 'max_sentence1_length': 126, 'unique_sentence1': 518, 'min_sentence2_length': 13, 'average_sentence2_length': 47.62, 'max_sentence2_length': 125, 'unique_sentence2': 518}, 'lav-ukr': {'num_samples': 579, 'number_of_characters': 52695, 'unique_pairs': 579, 'min_sentence1_length': 11, 'average_sentence1_length': 45.8, 'max_sentence1_length': 151, 'unique_sentence1': 579, 'min_sentence2_length': 12, 'average_sentence2_length': 45.21, 'max_sentence2_length': 170, 'unique_sentence2': 579}, 'lit-pol': {'num_samples': 1026, 'number_of_characters': 102814, 'unique_pairs': 1026, 'min_sentence1_length': 13, 'average_sentence1_length': 48.94, 'max_sentence1_length': 188, 'unique_sentence1': 1026, 'min_sentence2_length': 15, 'average_sentence2_length': 51.27, 'max_sentence2_length': 206, 'unique_sentence2': 1026}, 'lit-rus': {'num_samples': 961, 'number_of_characters': 96822, 'unique_pairs': 961, 'min_sentence1_length': 12, 'average_sentence1_length': 49.5, 'max_sentence1_length': 155, 'unique_sentence1': 961, 'min_sentence2_length': 12, 'average_sentence2_length': 51.25, 'max_sentence2_length': 182, 'unique_sentence2': 961}, 'lit-slk': {'num_samples': 859, 'number_of_characters': 82605, 'unique_pairs': 859, 'min_sentence1_length': 12, 'average_sentence1_length': 49.03, 'max_sentence1_length': 188, 'unique_sentence1': 859, 'min_sentence2_length': 10, 'average_sentence2_length': 47.13, 'max_sentence2_length': 195, 'unique_sentence2': 859}, 'lit-slv': {'num_samples': 607, 'number_of_characters': 58755, 'unique_pairs': 607, 'min_sentence1_length': 12, 'average_sentence1_length': 48.06, 'max_sentence1_length': 155, 'unique_sentence1': 607, 'min_sentence2_length': 11, 'average_sentence2_length': 48.74, 'max_sentence2_length': 145, 'unique_sentence2': 607}, 'lit-ukr': {'num_samples': 639, 'number_of_characters': 59450, 'unique_pairs': 639, 'min_sentence1_length': 14, 'average_sentence1_length': 46.94, 'max_sentence1_length': 149, 'unique_sentence1': 639, 'min_sentence2_length': 13, 'average_sentence2_length': 46.1, 'max_sentence2_length': 170, 'unique_sentence2': 639}, 'pol-rus': {'num_samples': 5014, 'number_of_characters': 536093, 'unique_pairs': 5014, 'min_sentence1_length': 12, 'average_sentence1_length': 53.29, 'max_sentence1_length': 424, 'unique_sentence1': 5014, 'min_sentence2_length': 11, 'average_sentence2_length': 53.63, 'max_sentence2_length': 456, 'unique_sentence2': 5014}, 'pol-slk': {'num_samples': 1918, 'number_of_characters': 196496, 'unique_pairs': 1918, 'min_sentence1_length': 14, 'average_sentence1_length': 53.03, 'max_sentence1_length': 206, 'unique_sentence1': 1918, 'min_sentence2_length': 10, 'average_sentence2_length': 49.42, 'max_sentence2_length': 194, 'unique_sentence2': 1918}, 'pol-slv': {'num_samples': 1382, 'number_of_characters': 160396, 'unique_pairs': 1382, 'min_sentence1_length': 9, 'average_sentence1_length': 57.59, 'max_sentence1_length': 180, 'unique_sentence1': 1382, 'min_sentence2_length': 8, 'average_sentence2_length': 58.47, 'max_sentence2_length': 146, 'unique_sentence2': 1382}, 'pol-srp': {'num_samples': 492, 'number_of_characters': 54772, 'unique_pairs': 492, 'min_sentence1_length': 19, 'average_sentence1_length': 54.55, 'max_sentence1_length': 148, 'unique_sentence1': 492, 'min_sentence2_length': 16, 'average_sentence2_length': 56.77, 'max_sentence2_length': 129, 'unique_sentence2': 492}, 'pol-ukr': {'num_samples': 2370, 'number_of_characters': 251424, 'unique_pairs': 2370, 'min_sentence1_length': 14, 'average_sentence1_length': 53.8, 'max_sentence1_length': 305, 'unique_sentence1': 2370, 'min_sentence2_length': 12, 'average_sentence2_length': 52.29, 'max_sentence2_length': 202, 'unique_sentence2': 2370}, 'rus-slk': {'num_samples': 1263, 'number_of_characters': 136750, 'unique_pairs': 1263, 'min_sentence1_length': 13, 'average_sentence1_length': 56.65, 'max_sentence1_length': 316, 'unique_sentence1': 1263, 'min_sentence2_length': 10, 'average_sentence2_length': 51.63, 'max_sentence2_length': 309, 'unique_sentence2': 1263}, 'rus-slv': {'num_samples': 1096, 'number_of_characters': 133263, 'unique_pairs': 1096, 'min_sentence1_length': 14, 'average_sentence1_length': 62.56, 'max_sentence1_length': 263, 'unique_sentence1': 1096, 'min_sentence2_length': 11, 'average_sentence2_length': 59.03, 'max_sentence2_length': 252, 'unique_sentence2': 1096}, 'rus-srp': {'num_samples': 455, 'number_of_characters': 51800, 'unique_pairs': 455, 'min_sentence1_length': 17, 'average_sentence1_length': 57.52, 'max_sentence1_length': 122, 'unique_sentence1': 455, 'min_sentence2_length': 12, 'average_sentence2_length': 56.33, 'max_sentence2_length': 129, 'unique_sentence2': 455}, 'rus-ukr': {'num_samples': 15251, 'number_of_characters': 1504266, 'unique_pairs': 15251, 'min_sentence1_length': 10, 'average_sentence1_length': 49.98, 'max_sentence1_length': 308, 'unique_sentence1': 15251, 'min_sentence2_length': 10, 'average_sentence2_length': 48.66, 'max_sentence2_length': 353, 'unique_sentence2': 15251}, 'slk-slv': {'num_samples': 1259, 'number_of_characters': 133621, 'unique_pairs': 1259, 'min_sentence1_length': 10, 'average_sentence1_length': 50.42, 'max_sentence1_length': 234, 'unique_sentence1': 1259, 'min_sentence2_length': 11, 'average_sentence2_length': 55.71, 'max_sentence2_length': 252, 'unique_sentence2': 1259}, 'slk-srp': {'num_samples': 561, 'number_of_characters': 57637, 'unique_pairs': 561, 'min_sentence1_length': 15, 'average_sentence1_length': 47.93, 'max_sentence1_length': 117, 'unique_sentence1': 561, 'min_sentence2_length': 16, 'average_sentence2_length': 54.81, 'max_sentence2_length': 112, 'unique_sentence2': 561}, 'slk-ukr': {'num_samples': 944, 'number_of_characters': 90612, 'unique_pairs': 944, 'min_sentence1_length': 10, 'average_sentence1_length': 46.87, 'max_sentence1_length': 237, 'unique_sentence1': 944, 'min_sentence2_length': 12, 'average_sentence2_length': 49.11, 'max_sentence2_length': 253, 'unique_sentence2': 944}, 'slv-srp': {'num_samples': 499, 'number_of_characters': 60828, 'unique_pairs': 499, 'min_sentence1_length': 16, 'average_sentence1_length': 63.64, 'max_sentence1_length': 122, 'unique_sentence1': 499, 'min_sentence2_length': 16, 'average_sentence2_length': 58.26, 'max_sentence2_length': 118, 'unique_sentence2': 499}, 'slv-ukr': {'num_samples': 733, 'number_of_characters': 86586, 'unique_pairs': 733, 'min_sentence1_length': 11, 'average_sentence1_length': 57.88, 'max_sentence1_length': 124, 'unique_sentence1': 733, 'min_sentence2_length': 12, 'average_sentence2_length': 60.24, 'max_sentence2_length': 132, 'unique_sentence2': 733}, 'cat-deu': {'num_samples': 302, 'number_of_characters': 33752, 'unique_pairs': 302, 'min_sentence1_length': 13, 'average_sentence1_length': 53.35, 'max_sentence1_length': 255, 'unique_sentence1': 302, 'min_sentence2_length': 12, 'average_sentence2_length': 58.41, 'max_sentence2_length': 269, 'unique_sentence2': 302}, 'cat-fra': {'num_samples': 598, 'number_of_characters': 69263, 'unique_pairs': 598, 'min_sentence1_length': 14, 'average_sentence1_length': 53.4, 'max_sentence1_length': 202, 'unique_sentence1': 598, 'min_sentence2_length': 16, 'average_sentence2_length': 62.42, 'max_sentence2_length': 235, 'unique_sentence2': 598}, 'cat-ita': {'num_samples': 418, 'number_of_characters': 43550, 'unique_pairs': 418, 'min_sentence1_length': 15, 'average_sentence1_length': 50.67, 'max_sentence1_length': 195, 'unique_sentence1': 418, 'min_sentence2_length': 15, 'average_sentence2_length': 53.51, 'max_sentence2_length': 230, 'unique_sentence2': 418}, 'cat-por': {'num_samples': 370, 'number_of_characters': 36411, 'unique_pairs': 370, 'min_sentence1_length': 15, 'average_sentence1_length': 48.72, 'max_sentence1_length': 193, 'unique_sentence1': 370, 'min_sentence2_length': 15, 'average_sentence2_length': 49.69, 'max_sentence2_length': 211, 'unique_sentence2': 370}, 'cat-spa': {'num_samples': 2648, 'number_of_characters': 285897, 'unique_pairs': 2648, 'min_sentence1_length': 10, 'average_sentence1_length': 52.98, 'max_sentence1_length': 216, 'unique_sentence1': 2648, 'min_sentence2_length': 11, 'average_sentence2_length': 54.98, 'max_sentence2_length': 247, 'unique_sentence2': 2648}, 'dan-deu': {'num_samples': 4337, 'number_of_characters': 463434, 'unique_pairs': 4337, 'min_sentence1_length': 11, 'average_sentence1_length': 50.84, 'max_sentence1_length': 198, 'unique_sentence1': 4337, 'min_sentence2_length': 11, 'average_sentence2_length': 56.02, 'max_sentence2_length': 224, 'unique_sentence2': 4337}, 'dan-fra': {'num_samples': 3802, 'number_of_characters': 434788, 'unique_pairs': 3802, 'min_sentence1_length': 12, 'average_sentence1_length': 51.92, 'max_sentence1_length': 296, 'unique_sentence1': 3802, 'min_sentence2_length': 11, 'average_sentence2_length': 62.44, 'max_sentence2_length': 345, 'unique_sentence2': 3802}, 'dan-isl': {'num_samples': 327, 'number_of_characters': 32853, 'unique_pairs': 327, 'min_sentence1_length': 12, 'average_sentence1_length': 51.35, 'max_sentence1_length': 198, 'unique_sentence1': 327, 'min_sentence2_length': 12, 'average_sentence2_length': 49.11, 'max_sentence2_length': 181, 'unique_sentence2': 327}, 'dan-ita': {'num_samples': 3818, 'number_of_characters': 421045, 'unique_pairs': 3818, 'min_sentence1_length': 12, 'average_sentence1_length': 53.2, 'max_sentence1_length': 296, 'unique_sentence1': 3818, 'min_sentence2_length': 11, 'average_sentence2_length': 57.08, 'max_sentence2_length': 271, 'unique_sentence2': 3818}, 'dan-nld': {'num_samples': 4099, 'number_of_characters': 428737, 'unique_pairs': 4099, 'min_sentence1_length': 11, 'average_sentence1_length': 51.53, 'max_sentence1_length': 225, 'unique_sentence1': 4099, 'min_sentence2_length': 10, 'average_sentence2_length': 53.06, 'max_sentence2_length': 254, 'unique_sentence2': 4099}, 'dan-nor': {'num_samples': 2603, 'number_of_characters': 278953, 'unique_pairs': 2603, 'min_sentence1_length': 13, 'average_sentence1_length': 54.74, 'max_sentence1_length': 296, 'unique_sentence1': 2603, 'min_sentence2_length': 11, 'average_sentence2_length': 52.42, 'max_sentence2_length': 291, 'unique_sentence2': 2603}, 'dan-por': {'num_samples': 3206, 'number_of_characters': 349267, 'unique_pairs': 3206, 'min_sentence1_length': 12, 'average_sentence1_length': 52.78, 'max_sentence1_length': 176, 'unique_sentence1': 3206, 'min_sentence2_length': 10, 'average_sentence2_length': 56.16, 'max_sentence2_length': 211, 'unique_sentence2': 3206}, 'dan-ron': {'num_samples': 2052, 'number_of_characters': 225425, 'unique_pairs': 2052, 'min_sentence1_length': 8, 'average_sentence1_length': 53.82, 'max_sentence1_length': 296, 'unique_sentence1': 2052, 'min_sentence2_length': 9, 'average_sentence2_length': 56.03, 'max_sentence2_length': 281, 'unique_sentence2': 2052}, 'dan-spa': {'num_samples': 3571, 'number_of_characters': 389048, 'unique_pairs': 3571, 'min_sentence1_length': 12, 'average_sentence1_length': 52.06, 'max_sentence1_length': 257, 'unique_sentence1': 3571, 'min_sentence2_length': 12, 'average_sentence2_length': 56.89, 'max_sentence2_length': 239, 'unique_sentence2': 3571}, 'dan-swe': {'num_samples': 4268, 'number_of_characters': 440347, 'unique_pairs': 4268, 'min_sentence1_length': 12, 'average_sentence1_length': 52.09, 'max_sentence1_length': 315, 'unique_sentence1': 4268, 'min_sentence2_length': 11, 'average_sentence2_length': 51.09, 'max_sentence2_length': 330, 'unique_sentence2': 4268}, 'deu-fra': {'num_samples': 27727, 'number_of_characters': 3222716, 'unique_pairs': 27727, 'min_sentence1_length': 6, 'average_sentence1_length': 55.56, 'max_sentence1_length': 337, 'unique_sentence1': 27727, 'min_sentence2_length': 6, 'average_sentence2_length': 60.67, 'max_sentence2_length': 330, 'unique_sentence2': 27727}, 'deu-isl': {'num_samples': 294, 'number_of_characters': 31097, 'unique_pairs': 294, 'min_sentence1_length': 14, 'average_sentence1_length': 56.09, 'max_sentence1_length': 201, 'unique_sentence1': 294, 'min_sentence2_length': 14, 'average_sentence2_length': 49.69, 'max_sentence2_length': 181, 'unique_sentence2': 294}, 'deu-ita': {'num_samples': 18787, 'number_of_characters': 2100285, 'unique_pairs': 18787, 'min_sentence1_length': 10, 'average_sentence1_length': 56.25, 'max_sentence1_length': 346, 'unique_sentence1': 18787, 'min_sentence2_length': 10, 'average_sentence2_length': 55.54, 'max_sentence2_length': 357, 'unique_sentence2': 18787}, 'deu-nld': {'num_samples': 14211, 'number_of_characters': 1508365, 'unique_pairs': 14211, 'min_sentence1_length': 10, 'average_sentence1_length': 54.51, 'max_sentence1_length': 299, 'unique_sentence1': 14211, 'min_sentence2_length': 10, 'average_sentence2_length': 51.63, 'max_sentence2_length': 276, 'unique_sentence2': 14211}, 'deu-nor': {'num_samples': 2783, 'number_of_characters': 295406, 'unique_pairs': 2783, 'min_sentence1_length': 10, 'average_sentence1_length': 56.26, 'max_sentence1_length': 224, 'unique_sentence1': 2783, 'min_sentence2_length': 11, 'average_sentence2_length': 49.89, 'max_sentence2_length': 163, 'unique_sentence2': 2783}, 'deu-por': {'num_samples': 11319, 'number_of_characters': 1213812, 'unique_pairs': 11319, 'min_sentence1_length': 9, 'average_sentence1_length': 54.7, 'max_sentence1_length': 374, 'unique_sentence1': 11319, 'min_sentence2_length': 10, 'average_sentence2_length': 52.54, 'max_sentence2_length': 337, 'unique_sentence2': 11319}, 'deu-ron': {'num_samples': 3598, 'number_of_characters': 401116, 'unique_pairs': 3598, 'min_sentence1_length': 8, 'average_sentence1_length': 57.01, 'max_sentence1_length': 716, 'unique_sentence1': 3598, 'min_sentence2_length': 9, 'average_sentence2_length': 54.48, 'max_sentence2_length': 699, 'unique_sentence2': 3598}, 'deu-spa': {'num_samples': 19739, 'number_of_characters': 2151487, 'unique_pairs': 19739, 'min_sentence1_length': 9, 'average_sentence1_length': 54.84, 'max_sentence1_length': 428, 'unique_sentence1': 19739, 'min_sentence2_length': 10, 'average_sentence2_length': 54.16, 'max_sentence2_length': 415, 'unique_sentence2': 19739}, 'deu-swe': {'num_samples': 5772, 'number_of_characters': 610949, 'unique_pairs': 5772, 'min_sentence1_length': 10, 'average_sentence1_length': 55.78, 'max_sentence1_length': 264, 'unique_sentence1': 5772, 'min_sentence2_length': 10, 'average_sentence2_length': 50.06, 'max_sentence2_length': 237, 'unique_sentence2': 5772}, 'fra-isl': {'num_samples': 347, 'number_of_characters': 39368, 'unique_pairs': 347, 'min_sentence1_length': 14, 'average_sentence1_length': 62.12, 'max_sentence1_length': 229, 'unique_sentence1': 347, 'min_sentence2_length': 16, 'average_sentence2_length': 51.34, 'max_sentence2_length': 194, 'unique_sentence2': 347}, 'fra-ita': {'num_samples': 20002, 'number_of_characters': 2326983, 'unique_pairs': 20002, 'min_sentence1_length': 10, 'average_sentence1_length': 61.12, 'max_sentence1_length': 743, 'unique_sentence1': 20002, 'min_sentence2_length': 8, 'average_sentence2_length': 55.22, 'max_sentence2_length': 653, 'unique_sentence2': 20002}, 'fra-nld': {'num_samples': 14684, 'number_of_characters': 1659621, 'unique_pairs': 14684, 'min_sentence1_length': 10, 'average_sentence1_length': 60.34, 'max_sentence1_length': 400, 'unique_sentence1': 14684, 'min_sentence2_length': 10, 'average_sentence2_length': 52.69, 'max_sentence2_length': 263, 'unique_sentence2': 14684}, 'fra-nor': {'num_samples': 2558, 'number_of_characters': 290540, 'unique_pairs': 2558, 'min_sentence1_length': 10, 'average_sentence1_length': 63.31, 'max_sentence1_length': 377, 'unique_sentence1': 2558, 'min_sentence2_length': 11, 'average_sentence2_length': 50.27, 'max_sentence2_length': 305, 'unique_sentence2': 2558}, 'fra-por': {'num_samples': 13265, 'number_of_characters': 1485292, 'unique_pairs': 13265, 'min_sentence1_length': 10, 'average_sentence1_length': 59.58, 'max_sentence1_length': 284, 'unique_sentence1': 13265, 'min_sentence2_length': 8, 'average_sentence2_length': 52.39, 'max_sentence2_length': 443, 'unique_sentence2': 13265}, 'fra-ron': {'num_samples': 3295, 'number_of_characters': 381273, 'unique_pairs': 3295, 'min_sentence1_length': 8, 'average_sentence1_length': 61.62, 'max_sentence1_length': 677, 'unique_sentence1': 3295, 'min_sentence2_length': 8, 'average_sentence2_length': 54.1, 'max_sentence2_length': 587, 'unique_sentence2': 3295}, 'fra-spa': {'num_samples': 23311, 'number_of_characters': 2650610, 'unique_pairs': 23311, 'min_sentence1_length': 10, 'average_sentence1_length': 59.63, 'max_sentence1_length': 677, 'unique_sentence1': 23311, 'min_sentence2_length': 8, 'average_sentence2_length': 54.08, 'max_sentence2_length': 576, 'unique_sentence2': 23311}, 'fra-swe': {'num_samples': 5006, 'number_of_characters': 565299, 'unique_pairs': 5006, 'min_sentence1_length': 10, 'average_sentence1_length': 62.23, 'max_sentence1_length': 345, 'unique_sentence1': 5006, 'min_sentence2_length': 11, 'average_sentence2_length': 50.7, 'max_sentence2_length': 289, 'unique_sentence2': 5006}, 'isl-ita': {'num_samples': 421, 'number_of_characters': 42563, 'unique_pairs': 421, 'min_sentence1_length': 14, 'average_sentence1_length': 48.12, 'max_sentence1_length': 145, 'unique_sentence1': 421, 'min_sentence2_length': 13, 'average_sentence2_length': 52.98, 'max_sentence2_length': 151, 'unique_sentence2': 421}, 'isl-nld': {'num_samples': 311, 'number_of_characters': 33292, 'unique_pairs': 311, 'min_sentence1_length': 13, 'average_sentence1_length': 51.5, 'max_sentence1_length': 160, 'unique_sentence1': 311, 'min_sentence2_length': 11, 'average_sentence2_length': 55.55, 'max_sentence2_length': 228, 'unique_sentence2': 311}, 'isl-por': {'num_samples': 341, 'number_of_characters': 34367, 'unique_pairs': 341, 'min_sentence1_length': 12, 'average_sentence1_length': 48.83, 'max_sentence1_length': 135, 'unique_sentence1': 341, 'min_sentence2_length': 12, 'average_sentence2_length': 51.96, 'max_sentence2_length': 156, 'unique_sentence2': 341}, 'isl-spa': {'num_samples': 366, 'number_of_characters': 38563, 'unique_pairs': 366, 'min_sentence1_length': 14, 'average_sentence1_length': 49.8, 'max_sentence1_length': 196, 'unique_sentence1': 366, 'min_sentence2_length': 17, 'average_sentence2_length': 55.56, 'max_sentence2_length': 206, 'unique_sentence2': 366}, 'isl-swe': {'num_samples': 312, 'number_of_characters': 30662, 'unique_pairs': 312, 'min_sentence1_length': 14, 'average_sentence1_length': 48.84, 'max_sentence1_length': 149, 'unique_sentence1': 312, 'min_sentence2_length': 15, 'average_sentence2_length': 49.44, 'max_sentence2_length': 141, 'unique_sentence2': 312}, 'ita-nld': {'num_samples': 9160, 'number_of_characters': 973718, 'unique_pairs': 9160, 'min_sentence1_length': 11, 'average_sentence1_length': 54.2, 'max_sentence1_length': 291, 'unique_sentence1': 9160, 'min_sentence2_length': 10, 'average_sentence2_length': 52.1, 'max_sentence2_length': 337, 'unique_sentence2': 9160}, 'ita-nor': {'num_samples': 2516, 'number_of_characters': 267838, 'unique_pairs': 2516, 'min_sentence1_length': 13, 'average_sentence1_length': 56.18, 'max_sentence1_length': 356, 'unique_sentence1': 2516, 'min_sentence2_length': 12, 'average_sentence2_length': 50.28, 'max_sentence2_length': 305, 'unique_sentence2': 2516}, 'ita-por': {'num_samples': 10924, 'number_of_characters': 1157982, 'unique_pairs': 10924, 'min_sentence1_length': 10, 'average_sentence1_length': 53.5, 'max_sentence1_length': 357, 'unique_sentence1': 10924, 'min_sentence2_length': 10, 'average_sentence2_length': 52.5, 'max_sentence2_length': 276, 'unique_sentence2': 10924}, 'ita-ron': {'num_samples': 3360, 'number_of_characters': 375697, 'unique_pairs': 3360, 'min_sentence1_length': 8, 'average_sentence1_length': 56.38, 'max_sentence1_length': 346, 'unique_sentence1': 3360, 'min_sentence2_length': 9, 'average_sentence2_length': 55.43, 'max_sentence2_length': 402, 'unique_sentence2': 3360}, 'ita-spa': {'num_samples': 16534, 'number_of_characters': 1778757, 'unique_pairs': 16534, 'min_sentence1_length': 10, 'average_sentence1_length': 53.56, 'max_sentence1_length': 362, 'unique_sentence1': 16534, 'min_sentence2_length': 11, 'average_sentence2_length': 54.02, 'max_sentence2_length': 363, 'unique_sentence2': 16534}, 'ita-swe': {'num_samples': 4741, 'number_of_characters': 508653, 'unique_pairs': 4741, 'min_sentence1_length': 10, 'average_sentence1_length': 55.87, 'max_sentence1_length': 271, 'unique_sentence1': 4741, 'min_sentence2_length': 10, 'average_sentence2_length': 51.42, 'max_sentence2_length': 289, 'unique_sentence2': 4741}, 'nld-nor': {'num_samples': 2664, 'number_of_characters': 281584, 'unique_pairs': 2664, 'min_sentence1_length': 10, 'average_sentence1_length': 54.62, 'max_sentence1_length': 211, 'unique_sentence1': 2664, 'min_sentence2_length': 11, 'average_sentence2_length': 51.08, 'max_sentence2_length': 162, 'unique_sentence2': 2664}, 'nld-por': {'num_samples': 7021, 'number_of_characters': 738280, 'unique_pairs': 7021, 'min_sentence1_length': 10, 'average_sentence1_length': 51.91, 'max_sentence1_length': 243, 'unique_sentence1': 7021, 'min_sentence2_length': 11, 'average_sentence2_length': 53.24, 'max_sentence2_length': 443, 'unique_sentence2': 7021}, 'nld-ron': {'num_samples': 2888, 'number_of_characters': 311689, 'unique_pairs': 2888, 'min_sentence1_length': 10, 'average_sentence1_length': 53.16, 'max_sentence1_length': 228, 'unique_sentence1': 2888, 'min_sentence2_length': 11, 'average_sentence2_length': 54.77, 'max_sentence2_length': 252, 'unique_sentence2': 2888}, 'nld-spa': {'num_samples': 9555, 'number_of_characters': 1015756, 'unique_pairs': 9555, 'min_sentence1_length': 10, 'average_sentence1_length': 51.91, 'max_sentence1_length': 229, 'unique_sentence1': 9555, 'min_sentence2_length': 10, 'average_sentence2_length': 54.39, 'max_sentence2_length': 235, 'unique_sentence2': 9555}, 'nld-swe': {'num_samples': 5072, 'number_of_characters': 529162, 'unique_pairs': 5072, 'min_sentence1_length': 10, 'average_sentence1_length': 53.53, 'max_sentence1_length': 333, 'unique_sentence1': 5072, 'min_sentence2_length': 10, 'average_sentence2_length': 50.8, 'max_sentence2_length': 311, 'unique_sentence2': 5072}, 'nor-por': {'num_samples': 2096, 'number_of_characters': 227941, 'unique_pairs': 2096, 'min_sentence1_length': 11, 'average_sentence1_length': 51.43, 'max_sentence1_length': 200, 'unique_sentence1': 2096, 'min_sentence2_length': 12, 'average_sentence2_length': 57.32, 'max_sentence2_length': 180, 'unique_sentence2': 2096}, 'nor-ron': {'num_samples': 1412, 'number_of_characters': 156087, 'unique_pairs': 1412, 'min_sentence1_length': 13, 'average_sentence1_length': 52.16, 'max_sentence1_length': 291, 'unique_sentence1': 1412, 'min_sentence2_length': 14, 'average_sentence2_length': 58.39, 'max_sentence2_length': 281, 'unique_sentence2': 1412}, 'nor-spa': {'num_samples': 2603, 'number_of_characters': 283540, 'unique_pairs': 2603, 'min_sentence1_length': 12, 'average_sentence1_length': 51.0, 'max_sentence1_length': 283, 'unique_sentence1': 2603, 'min_sentence2_length': 13, 'average_sentence2_length': 57.93, 'max_sentence2_length': 274, 'unique_sentence2': 2603}, 'nor-swe': {'num_samples': 3165, 'number_of_characters': 325198, 'unique_pairs': 3165, 'min_sentence1_length': 11, 'average_sentence1_length': 50.94, 'max_sentence1_length': 291, 'unique_sentence1': 3165, 'min_sentence2_length': 10, 'average_sentence2_length': 51.81, 'max_sentence2_length': 289, 'unique_sentence2': 3165}, 'por-ron': {'num_samples': 3026, 'number_of_characters': 341012, 'unique_pairs': 3026, 'min_sentence1_length': 12, 'average_sentence1_length': 56.57, 'max_sentence1_length': 334, 'unique_sentence1': 3026, 'min_sentence2_length': 12, 'average_sentence2_length': 56.13, 'max_sentence2_length': 402, 'unique_sentence2': 3026}, 'por-spa': {'num_samples': 16084, 'number_of_characters': 1698580, 'unique_pairs': 16084, 'min_sentence1_length': 10, 'average_sentence1_length': 51.93, 'max_sentence1_length': 350, 'unique_sentence1': 16084, 'min_sentence2_length': 10, 'average_sentence2_length': 53.68, 'max_sentence2_length': 364, 'unique_sentence2': 16084}, 'por-swe': {'num_samples': 4235, 'number_of_characters': 444824, 'unique_pairs': 4235, 'min_sentence1_length': 11, 'average_sentence1_length': 54.48, 'max_sentence1_length': 282, 'unique_sentence1': 4235, 'min_sentence2_length': 11, 'average_sentence2_length': 50.55, 'max_sentence2_length': 255, 'unique_sentence2': 4235}, 'ron-spa': {'num_samples': 3375, 'number_of_characters': 376497, 'unique_pairs': 3375, 'min_sentence1_length': 12, 'average_sentence1_length': 54.71, 'max_sentence1_length': 587, 'unique_sentence1': 3375, 'min_sentence2_length': 11, 'average_sentence2_length': 56.85, 'max_sentence2_length': 576, 'unique_sentence2': 3375}, 'ron-swe': {'num_samples': 2154, 'number_of_characters': 238390, 'unique_pairs': 2154, 'min_sentence1_length': 9, 'average_sentence1_length': 57.04, 'max_sentence1_length': 281, 'unique_sentence1': 2154, 'min_sentence2_length': 8, 'average_sentence2_length': 53.64, 'max_sentence2_length': 289, 'unique_sentence2': 2154}, 'spa-swe': {'num_samples': 4884, 'number_of_characters': 525651, 'unique_pairs': 4884, 'min_sentence1_length': 12, 'average_sentence1_length': 56.58, 'max_sentence1_length': 244, 'unique_sentence1': 4884, 'min_sentence2_length': 10, 'average_sentence2_length': 51.04, 'max_sentence2_length': 280, 'unique_sentence2': 4884}, 'ben-hin': {'num_samples': 1174, 'number_of_characters': 115288, 'unique_pairs': 1174, 'min_sentence1_length': 11, 'average_sentence1_length': 47.5, 'max_sentence1_length': 145, 'unique_sentence1': 1174, 'min_sentence2_length': 13, 'average_sentence2_length': 50.71, 'max_sentence2_length': 149, 'unique_sentence2': 1174}, 'ben-mar': {'num_samples': 566, 'number_of_characters': 54106, 'unique_pairs': 566, 'min_sentence1_length': 12, 'average_sentence1_length': 47.8, 'max_sentence1_length': 145, 'unique_sentence1': 566, 'min_sentence2_length': 14, 'average_sentence2_length': 47.8, 'max_sentence2_length': 130, 'unique_sentence2': 566}, 'ben-urd': {'num_samples': 488, 'number_of_characters': 44387, 'unique_pairs': 488, 'min_sentence1_length': 11, 'average_sentence1_length': 44.0, 'max_sentence1_length': 110, 'unique_sentence1': 488, 'min_sentence2_length': 12, 'average_sentence2_length': 46.96, 'max_sentence2_length': 101, 'unique_sentence2': 488}, 'hin-mar': {'num_samples': 615, 'number_of_characters': 59136, 'unique_pairs': 615, 'min_sentence1_length': 11, 'average_sentence1_length': 49.57, 'max_sentence1_length': 149, 'unique_sentence1': 615, 'min_sentence2_length': 11, 'average_sentence2_length': 46.59, 'max_sentence2_length': 143, 'unique_sentence2': 615}, 'hin-urd': {'num_samples': 545, 'number_of_characters': 52165, 'unique_pairs': 545, 'min_sentence1_length': 14, 'average_sentence1_length': 48.38, 'max_sentence1_length': 111, 'unique_sentence1': 545, 'min_sentence2_length': 12, 'average_sentence2_length': 47.34, 'max_sentence2_length': 125, 'unique_sentence2': 545}, 'mar-urd': {'num_samples': 270, 'number_of_characters': 23951, 'unique_pairs': 270, 'min_sentence1_length': 14, 'average_sentence1_length': 43.16, 'max_sentence1_length': 108, 'unique_sentence1': 270, 'min_sentence2_length': 13, 'average_sentence2_length': 45.54, 'max_sentence2_length': 107, 'unique_sentence2': 270}, 'aze-kaz': {'num_samples': 412, 'number_of_characters': 38912, 'unique_pairs': 412, 'min_sentence1_length': 12, 'average_sentence1_length': 46.7, 'max_sentence1_length': 121, 'unique_sentence1': 412, 'min_sentence2_length': 14, 'average_sentence2_length': 47.75, 'max_sentence2_length': 108, 'unique_sentence2': 412}, 'aze-tur': {'num_samples': 388, 'number_of_characters': 36138, 'unique_pairs': 388, 'min_sentence1_length': 12, 'average_sentence1_length': 46.69, 'max_sentence1_length': 124, 'unique_sentence1': 388, 'min_sentence2_length': 12, 'average_sentence2_length': 46.45, 'max_sentence2_length': 123, 'unique_sentence2': 388}, 'kaz-tur': {'num_samples': 340, 'number_of_characters': 31637, 'unique_pairs': 340, 'min_sentence1_length': 17, 'average_sentence1_length': 47.27, 'max_sentence1_length': 122, 'unique_sentence1': 340, 'min_sentence2_length': 12, 'average_sentence2_length': 45.78, 'max_sentence2_length': 114, 'unique_sentence2': 340}, 'est-fin': {'num_samples': 790, 'number_of_characters': 80226, 'unique_pairs': 790, 'min_sentence1_length': 13, 'average_sentence1_length': 50.35, 'max_sentence1_length': 158, 'unique_sentence1': 790, 'min_sentence2_length': 14, 'average_sentence2_length': 51.21, 'max_sentence2_length': 152, 'unique_sentence2': 790}, 'est-hun': {'num_samples': 674, 'number_of_characters': 69641, 'unique_pairs': 674, 'min_sentence1_length': 8, 'average_sentence1_length': 50.49, 'max_sentence1_length': 157, 'unique_sentence1': 674, 'min_sentence2_length': 9, 'average_sentence2_length': 52.83, 'max_sentence2_length': 180, 'unique_sentence2': 674}, 'fin-hun': {'num_samples': 1542, 'number_of_characters': 167588, 'unique_pairs': 1542, 'min_sentence1_length': 8, 'average_sentence1_length': 53.55, 'max_sentence1_length': 243, 'unique_sentence1': 1542, 'min_sentence2_length': 9, 'average_sentence2_length': 55.13, 'max_sentence2_length': 228, 'unique_sentence2': 1542}, 'ara-eng': {'num_samples': 5698, 'number_of_characters': 544132, 'unique_pairs': 5698, 'min_sentence1_length': 10, 'average_sentence1_length': 45.67, 'max_sentence1_length': 280, 'unique_sentence1': 5698, 'min_sentence2_length': 11, 'average_sentence2_length': 49.82, 'max_sentence2_length': 287, 'unique_sentence2': 5698}, 'aze-eng': {'num_samples': 603, 'number_of_characters': 58907, 'unique_pairs': 603, 'min_sentence1_length': 12, 'average_sentence1_length': 49.94, 'max_sentence1_length': 121, 'unique_sentence1': 603, 'min_sentence2_length': 12, 'average_sentence2_length': 47.75, 'max_sentence2_length': 129, 'unique_sentence2': 603}, 'ben-eng': {'num_samples': 1367, 'number_of_characters': 126399, 'unique_pairs': 1367, 'min_sentence1_length': 10, 'average_sentence1_length': 46.61, 'max_sentence1_length': 147, 'unique_sentence1': 1367, 'min_sentence2_length': 14, 'average_sentence2_length': 45.85, 'max_sentence2_length': 148, 'unique_sentence2': 1367}, 'bul-eng': {'num_samples': 2133, 'number_of_characters': 219893, 'unique_pairs': 2133, 'min_sentence1_length': 12, 'average_sentence1_length': 53.71, 'max_sentence1_length': 273, 'unique_sentence1': 2133, 'min_sentence2_length': 11, 'average_sentence2_length': 49.38, 'max_sentence2_length': 287, 'unique_sentence2': 2133}, 'cat-eng': {'num_samples': 1152, 'number_of_characters': 118852, 'unique_pairs': 1152, 'min_sentence1_length': 10, 'average_sentence1_length': 52.34, 'max_sentence1_length': 202, 'unique_sentence1': 1152, 'min_sentence2_length': 11, 'average_sentence2_length': 50.83, 'max_sentence2_length': 195, 'unique_sentence2': 1152}, 'ces-eng': {'num_samples': 3775, 'number_of_characters': 364386, 'unique_pairs': 3775, 'min_sentence1_length': 10, 'average_sentence1_length': 47.79, 'max_sentence1_length': 265, 'unique_sentence1': 3775, 'min_sentence2_length': 10, 'average_sentence2_length': 48.74, 'max_sentence2_length': 287, 'unique_sentence2': 3775}, 'dan-eng': {'num_samples': 4512, 'number_of_characters': 451232, 'unique_pairs': 4512, 'min_sentence1_length': 10, 'average_sentence1_length': 51.15, 'max_sentence1_length': 296, 'unique_sentence1': 4512, 'min_sentence2_length': 10, 'average_sentence2_length': 48.86, 'max_sentence2_length': 287, 'unique_sentence2': 4512}, 'deu-eng': {'num_samples': 37348, 'number_of_characters': 3890899, 'unique_pairs': 37348, 'min_sentence1_length': 8, 'average_sentence1_length': 55.15, 'max_sentence1_length': 716, 'unique_sentence1': 37348, 'min_sentence2_length': 9, 'average_sentence2_length': 49.03, 'max_sentence2_length': 586, 'unique_sentence2': 37348}, 'ell-eng': {'num_samples': 2790, 'number_of_characters': 302459, 'unique_pairs': 2790, 'min_sentence1_length': 14, 'average_sentence1_length': 58.78, 'max_sentence1_length': 286, 'unique_sentence1': 2790, 'min_sentence2_length': 11, 'average_sentence2_length': 49.63, 'max_sentence2_length': 243, 'unique_sentence2': 2790}, 'eng-est': {'num_samples': 755, 'number_of_characters': 75289, 'unique_pairs': 755, 'min_sentence1_length': 12, 'average_sentence1_length': 49.1, 'max_sentence1_length': 152, 'unique_sentence1': 755, 'min_sentence2_length': 15, 'average_sentence2_length': 50.62, 'max_sentence2_length': 160, 'unique_sentence2': 755}, 'eng-fas': {'num_samples': 556, 'number_of_characters': 52628, 'unique_pairs': 556, 'min_sentence1_length': 12, 'average_sentence1_length': 48.16, 'max_sentence1_length': 184, 'unique_sentence1': 556, 'min_sentence2_length': 10, 'average_sentence2_length': 46.49, 'max_sentence2_length': 161, 'unique_sentence2': 556}, 'eng-fin': {'num_samples': 3443, 'number_of_characters': 348436, 'unique_pairs': 3443, 'min_sentence1_length': 11, 'average_sentence1_length': 49.68, 'max_sentence1_length': 410, 'unique_sentence1': 3443, 'min_sentence2_length': 11, 'average_sentence2_length': 51.52, 'max_sentence2_length': 387, 'unique_sentence2': 3443}, 'eng-fra': {'num_samples': 37208, 'number_of_characters': 4091513, 'unique_pairs': 37208, 'min_sentence1_length': 8, 'average_sentence1_length': 49.11, 'max_sentence1_length': 414, 'unique_sentence1': 37208, 'min_sentence2_length': 10, 'average_sentence2_length': 60.86, 'max_sentence2_length': 428, 'unique_sentence2': 37208}, 'eng-heb': {'num_samples': 882, 'number_of_characters': 88397, 'unique_pairs': 882, 'min_sentence1_length': 13, 'average_sentence1_length': 54.84, 'max_sentence1_length': 193, 'unique_sentence1': 882, 'min_sentence2_length': 10, 'average_sentence2_length': 45.38, 'max_sentence2_length': 162, 'unique_sentence2': 882}, 'eng-hin': {'num_samples': 2219, 'number_of_characters': 227451, 'unique_pairs': 2219, 'min_sentence1_length': 12, 'average_sentence1_length': 49.35, 'max_sentence1_length': 222, 'unique_sentence1': 2219, 'min_sentence2_length': 12, 'average_sentence2_length': 53.15, 'max_sentence2_length': 281, 'unique_sentence2': 2219}, 'eng-hrv': {'num_samples': 336, 'number_of_characters': 35675, 'unique_pairs': 336, 'min_sentence1_length': 11, 'average_sentence1_length': 52.62, 'max_sentence1_length': 264, 'unique_sentence1': 336, 'min_sentence2_length': 16, 'average_sentence2_length': 53.56, 'max_sentence2_length': 258, 'unique_sentence2': 336}, 'eng-hun': {'num_samples': 2185, 'number_of_characters': 225323, 'unique_pairs': 2185, 'min_sentence1_length': 8, 'average_sentence1_length': 50.01, 'max_sentence1_length': 238, 'unique_sentence1': 2185, 'min_sentence2_length': 9, 'average_sentence2_length': 53.11, 'max_sentence2_length': 233, 'unique_sentence2': 2185}, 'eng-ind': {'num_samples': 3454, 'number_of_characters': 364799, 'unique_pairs': 3454, 'min_sentence1_length': 11, 'average_sentence1_length': 49.18, 'max_sentence1_length': 222, 'unique_sentence1': 3454, 'min_sentence2_length': 11, 'average_sentence2_length': 56.43, 'max_sentence2_length': 245, 'unique_sentence2': 3454}, 'eng-isl': {'num_samples': 358, 'number_of_characters': 33431, 'unique_pairs': 358, 'min_sentence1_length': 13, 'average_sentence1_length': 46.72, 'max_sentence1_length': 136, 'unique_sentence1': 358, 'min_sentence2_length': 13, 'average_sentence2_length': 46.66, 'max_sentence2_length': 122, 'unique_sentence2': 358}, 'eng-ita': {'num_samples': 19661, 'number_of_characters': 2063797, 'unique_pairs': 19661, 'min_sentence1_length': 10, 'average_sentence1_length': 49.52, 'max_sentence1_length': 365, 'unique_sentence1': 19661, 'min_sentence2_length': 10, 'average_sentence2_length': 55.45, 'max_sentence2_length': 362, 'unique_sentence2': 19661}, 'eng-jpn': {'num_samples': 3807, 'number_of_characters': 318641, 'unique_pairs': 3807, 'min_sentence1_length': 12, 'average_sentence1_length': 51.45, 'max_sentence1_length': 743, 'unique_sentence1': 3807, 'min_sentence2_length': 10, 'average_sentence2_length': 32.25, 'max_sentence2_length': 414, 'unique_sentence2': 3807}, 'eng-kaz': {'num_samples': 346, 'number_of_characters': 32798, 'unique_pairs': 346, 'min_sentence1_length': 15, 'average_sentence1_length': 45.54, 'max_sentence1_length': 110, 'unique_sentence1': 346, 'min_sentence2_length': 18, 'average_sentence2_length': 49.25, 'max_sentence2_length': 122, 'unique_sentence2': 346}, 'eng-kor': {'num_samples': 2558, 'number_of_characters': 217654, 'unique_pairs': 2558, 'min_sentence1_length': 12, 'average_sentence1_length': 51.78, 'max_sentence1_length': 252, 'unique_sentence1': 2558, 'min_sentence2_length': 10, 'average_sentence2_length': 33.31, 'max_sentence2_length': 225, 'unique_sentence2': 2558}, 'eng-lav': {'num_samples': 1079, 'number_of_characters': 103672, 'unique_pairs': 1079, 'min_sentence1_length': 12, 'average_sentence1_length': 47.47, 'max_sentence1_length': 165, 'unique_sentence1': 1079, 'min_sentence2_length': 11, 'average_sentence2_length': 48.61, 'max_sentence2_length': 151, 'unique_sentence2': 1079}, 'eng-lit': {'num_samples': 1185, 'number_of_characters': 113428, 'unique_pairs': 1185, 'min_sentence1_length': 12, 'average_sentence1_length': 47.12, 'max_sentence1_length': 175, 'unique_sentence1': 1185, 'min_sentence2_length': 12, 'average_sentence2_length': 48.6, 'max_sentence2_length': 167, 'unique_sentence2': 1185}, 'eng-mar': {'num_samples': 280, 'number_of_characters': 26641, 'unique_pairs': 280, 'min_sentence1_length': 14, 'average_sentence1_length': 47.66, 'max_sentence1_length': 148, 'unique_sentence1': 280, 'min_sentence2_length': 14, 'average_sentence2_length': 47.49, 'max_sentence2_length': 143, 'unique_sentence2': 280}, 'eng-msa': {'num_samples': 469, 'number_of_characters': 59862, 'unique_pairs': 469, 'min_sentence1_length': 15, 'average_sentence1_length': 61.27, 'max_sentence1_length': 163, 'unique_sentence1': 469, 'min_sentence2_length': 14, 'average_sentence2_length': 66.37, 'max_sentence2_length': 170, 'unique_sentence2': 469}, 'eng-nld': {'num_samples': 15613, 'number_of_characters': 1555910, 'unique_pairs': 15613, 'min_sentence1_length': 10, 'average_sentence1_length': 47.86, 'max_sentence1_length': 312, 'unique_sentence1': 15613, 'min_sentence2_length': 10, 'average_sentence2_length': 51.79, 'max_sentence2_length': 337, 'unique_sentence2': 15613}, 'eng-nor': {'num_samples': 2666, 'number_of_characters': 263032, 'unique_pairs': 2666, 'min_sentence1_length': 11, 'average_sentence1_length': 49.04, 'max_sentence1_length': 287, 'unique_sentence1': 2666, 'min_sentence2_length': 11, 'average_sentence2_length': 49.63, 'max_sentence2_length': 291, 'unique_sentence2': 2666}, 'eng-pol': {'num_samples': 6868, 'number_of_characters': 682796, 'unique_pairs': 6868, 'min_sentence1_length': 8, 'average_sentence1_length': 47.91, 'max_sentence1_length': 313, 'unique_sentence1': 6868, 'min_sentence2_length': 9, 'average_sentence2_length': 51.51, 'max_sentence2_length': 325, 'unique_sentence2': 6868}, 'eng-por': {'num_samples': 12406, 'number_of_characters': 1237980, 'unique_pairs': 12406, 'min_sentence1_length': 10, 'average_sentence1_length': 47.42, 'max_sentence1_length': 743, 'unique_sentence1': 12406, 'min_sentence2_length': 10, 'average_sentence2_length': 52.37, 'max_sentence2_length': 761, 'unique_sentence2': 12406}, 'eng-ron': {'num_samples': 3039, 'number_of_characters': 322632, 'unique_pairs': 3039, 'min_sentence1_length': 8, 'average_sentence1_length': 50.61, 'max_sentence1_length': 586, 'unique_sentence1': 3039, 'min_sentence2_length': 9, 'average_sentence2_length': 55.55, 'max_sentence2_length': 699, 'unique_sentence2': 3039}, 'eng-rus': {'num_samples': 9360, 'number_of_characters': 983153, 'unique_pairs': 9360, 'min_sentence1_length': 10, 'average_sentence1_length': 51.39, 'max_sentence1_length': 607, 'unique_sentence1': 9360, 'min_sentence2_length': 10, 'average_sentence2_length': 53.65, 'max_sentence2_length': 773, 'unique_sentence2': 9360}, 'eng-slk': {'num_samples': 1823, 'number_of_characters': 178649, 'unique_pairs': 1823, 'min_sentence1_length': 12, 'average_sentence1_length': 49.06, 'max_sentence1_length': 637, 'unique_sentence1': 1823, 'min_sentence2_length': 10, 'average_sentence2_length': 48.94, 'max_sentence2_length': 597, 'unique_sentence2': 1823}, 'eng-slv': {'num_samples': 1450, 'number_of_characters': 151964, 'unique_pairs': 1450, 'min_sentence1_length': 12, 'average_sentence1_length': 51.28, 'max_sentence1_length': 287, 'unique_sentence1': 1450, 'min_sentence2_length': 11, 'average_sentence2_length': 53.52, 'max_sentence2_length': 252, 'unique_sentence2': 1450}, 'eng-spa': {'num_samples': 35446, 'number_of_characters': 3679410, 'unique_pairs': 35446, 'min_sentence1_length': 10, 'average_sentence1_length': 48.67, 'max_sentence1_length': 525, 'unique_sentence1': 35446, 'min_sentence2_length': 10, 'average_sentence2_length': 55.13, 'max_sentence2_length': 607, 'unique_sentence2': 35446}, 'eng-srp': {'num_samples': 303, 'number_of_characters': 29733, 'unique_pairs': 303, 'min_sentence1_length': 15, 'average_sentence1_length': 48.15, 'max_sentence1_length': 123, 'unique_sentence1': 303, 'min_sentence2_length': 10, 'average_sentence2_length': 49.98, 'max_sentence2_length': 128, 'unique_sentence2': 303}, 'eng-swe': {'num_samples': 6005, 'number_of_characters': 597122, 'unique_pairs': 6005, 'min_sentence1_length': 10, 'average_sentence1_length': 49.21, 'max_sentence1_length': 287, 'unique_sentence1': 6005, 'min_sentence2_length': 10, 'average_sentence2_length': 50.22, 'max_sentence2_length': 289, 'unique_sentence2': 6005}, 'eng-tgl': {'num_samples': 551, 'number_of_characters': 56336, 'unique_pairs': 551, 'min_sentence1_length': 14, 'average_sentence1_length': 45.61, 'max_sentence1_length': 165, 'unique_sentence1': 551, 'min_sentence2_length': 13, 'average_sentence2_length': 56.64, 'max_sentence2_length': 198, 'unique_sentence2': 551}, 'eng-tha': {'num_samples': 814, 'number_of_characters': 79610, 'unique_pairs': 814, 'min_sentence1_length': 11, 'average_sentence1_length': 50.45, 'max_sentence1_length': 544, 'unique_sentence1': 814, 'min_sentence2_length': 11, 'average_sentence2_length': 47.35, 'max_sentence2_length': 511, 'unique_sentence2': 814}, 'eng-tur': {'num_samples': 4606, 'number_of_characters': 446492, 'unique_pairs': 4606, 'min_sentence1_length': 10, 'average_sentence1_length': 47.39, 'max_sentence1_length': 287, 'unique_sentence1': 4606, 'min_sentence2_length': 10, 'average_sentence2_length': 49.54, 'max_sentence2_length': 314, 'unique_sentence2': 4606}, 'eng-ukr': {'num_samples': 3778, 'number_of_characters': 388398, 'unique_pairs': 3778, 'min_sentence1_length': 11, 'average_sentence1_length': 51.14, 'max_sentence1_length': 284, 'unique_sentence1': 3778, 'min_sentence2_length': 10, 'average_sentence2_length': 51.66, 'max_sentence2_length': 255, 'unique_sentence2': 3778}, 'eng-urd': {'num_samples': 268, 'number_of_characters': 25182, 'unique_pairs': 268, 'min_sentence1_length': 14, 'average_sentence1_length': 46.1, 'max_sentence1_length': 99, 'unique_sentence1': 268, 'min_sentence2_length': 15, 'average_sentence2_length': 47.86, 'max_sentence2_length': 107, 'unique_sentence2': 268}, 'eng-vie': {'num_samples': 1264, 'number_of_characters': 123022, 'unique_pairs': 1264, 'min_sentence1_length': 12, 'average_sentence1_length': 45.65, 'max_sentence1_length': 241, 'unique_sentence1': 1264, 'min_sentence2_length': 10, 'average_sentence2_length': 51.68, 'max_sentence2_length': 262, 'unique_sentence2': 1264}, 'eng-zho': {'num_samples': 4959, 'number_of_characters': 347349, 'unique_pairs': 4959, 'min_sentence1_length': 11, 'average_sentence1_length': 50.01, 'max_sentence1_length': 847, 'unique_sentence1': 4959, 'min_sentence2_length': 8, 'average_sentence2_length': 20.04, 'max_sentence2_length': 620, 'unique_sentence2': 4959}}}} | | [WebFAQRetrieval](https://huggingface.co/PaDaS-Lab) (Michael Dinzinger, 2025) | ['ara', 'aze', 'ben', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fas', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'isl', 'ita', 'jpn', 'kat', 'kaz', 'kor', 'lav', 'lit', 'mar', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tgl', 'tha', 'tur', 'ukr', 'urd', 'uzb', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | {'test': 11434472} | {'test': {'number_of_characters': 35239389, 'num_samples': 11434472, 'num_queries': 271168, 'num_documents': 11163304, 'min_document_length': 7, 'average_document_length': 1.16, 'max_document_length': 4317, 'unique_documents': 11163304, 'min_query_length': 2, 'average_query_length': 82.33, 'max_query_length': 2, 'unique_queries': 271168, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 271168, 'hf_subset_descriptive_stats': {'ara': {'number_of_characters': 746242, 'num_samples': 152664, 'num_queries': 10000, 'num_documents': 142664, 'min_document_length': 10, 'average_document_length': 3.23, 'max_document_length': 1503, 'unique_documents': 142664, 'min_query_length': 2, 'average_query_length': 28.53, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'aze': {'number_of_characters': 33357, 'num_samples': 5356, 'num_queries': 487, 'num_documents': 4869, 'min_document_length': 10, 'average_document_length': 4.85, 'max_document_length': 149, 'unique_documents': 4869, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 487, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 487}, 'ben': {'number_of_characters': 92226, 'num_samples': 15747, 'num_queries': 1432, 'num_documents': 14315, 'min_document_length': 10, 'average_document_length': 4.44, 'max_document_length': 322, 'unique_documents': 14315, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 1432, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1432}, 'bul': {'number_of_characters': 248572, 'num_samples': 38208, 'num_queries': 3474, 'num_documents': 34734, 'min_document_length': 10, 'average_document_length': 5.16, 'max_document_length': 396, 'unique_documents': 34734, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 3474, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3474}, 'cat': {'number_of_characters': 93488, 'num_samples': 13964, 'num_queries': 1270, 'num_documents': 12694, 'min_document_length': 10, 'average_document_length': 5.36, 'max_document_length': 184, 'unique_documents': 12694, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 1270, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1270}, 'ces': {'number_of_characters': 481288, 'num_samples': 79539, 'num_queries': 7231, 'num_documents': 72308, 'min_document_length': 9, 'average_document_length': 4.66, 'max_document_length': 994, 'unique_documents': 72308, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 7231, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 7231}, 'dan': {'number_of_characters': 760053, 'num_samples': 147686, 'num_queries': 10000, 'num_documents': 137686, 'min_document_length': 10, 'average_document_length': 3.52, 'max_document_length': 1420, 'unique_documents': 137686, 'min_query_length': 2, 'average_query_length': 27.54, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'deu': {'number_of_characters': 2318297, 'num_samples': 901201, 'num_queries': 10000, 'num_documents': 891201, 'min_document_length': 7, 'average_document_length': 0.6, 'max_document_length': 1191, 'unique_documents': 891201, 'min_query_length': 2, 'average_query_length': 178.24, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'ell': {'number_of_characters': 295282, 'num_samples': 42368, 'num_queries': 3852, 'num_documents': 38516, 'min_document_length': 12, 'average_document_length': 5.67, 'max_document_length': 229, 'unique_documents': 38516, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 3852, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3852}, 'eng': {'number_of_characters': 11040979, 'num_samples': 5288725, 'num_queries': 10000, 'num_documents': 5278725, 'min_document_length': 9, 'average_document_length': 0.09, 'max_document_length': 1177, 'unique_documents': 5278725, 'min_query_length': 2, 'average_query_length': 1055.74, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'est': {'number_of_characters': 90781, 'num_samples': 14181, 'num_queries': 1290, 'num_documents': 12891, 'min_document_length': 10, 'average_document_length': 5.04, 'max_document_length': 177, 'unique_documents': 12891, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 1290, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1290}, 'fas': {'number_of_characters': 914501, 'num_samples': 236940, 'num_queries': 10000, 'num_documents': 226940, 'min_document_length': 10, 'average_document_length': 2.03, 'max_document_length': 3115, 'unique_documents': 226940, 'min_query_length': 2, 'average_query_length': 45.39, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'fin': {'number_of_characters': 500237, 'num_samples': 80902, 'num_queries': 7355, 'num_documents': 73547, 'min_document_length': 9, 'average_document_length': 4.8, 'max_document_length': 294, 'unique_documents': 73547, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 7355, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 7355}, 'fra': {'number_of_characters': 1727184, 'num_samples': 579505, 'num_queries': 10000, 'num_documents': 569505, 'min_document_length': 8, 'average_document_length': 1.03, 'max_document_length': 489, 'unique_documents': 569505, 'min_query_length': 2, 'average_query_length': 113.9, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'heb': {'number_of_characters': 232589, 'num_samples': 42847, 'num_queries': 3896, 'num_documents': 38951, 'min_document_length': 10, 'average_document_length': 3.97, 'max_document_length': 278, 'unique_documents': 38951, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 3896, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3896}, 'hin': {'number_of_characters': 685269, 'num_samples': 110031, 'num_queries': 10000, 'num_documents': 100031, 'min_document_length': 10, 'average_document_length': 4.85, 'max_document_length': 879, 'unique_documents': 100031, 'min_query_length': 2, 'average_query_length': 20.01, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'hrv': {'number_of_characters': 38324, 'num_samples': 6100, 'num_queries': 555, 'num_documents': 5545, 'min_document_length': 11, 'average_document_length': 4.91, 'max_document_length': 132, 'unique_documents': 5545, 'min_query_length': 2, 'average_query_length': 19.98, 'max_query_length': 2, 'unique_queries': 555, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 555}, 'hun': {'number_of_characters': 319732, 'num_samples': 49824, 'num_queries': 4530, 'num_documents': 45294, 'min_document_length': 10, 'average_document_length': 5.06, 'max_document_length': 575, 'unique_documents': 45294, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 4530, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 4530}, 'ind': {'number_of_characters': 768010, 'num_samples': 121315, 'num_queries': 10000, 'num_documents': 111315, 'min_document_length': 8, 'average_document_length': 4.9, 'max_document_length': 850, 'unique_documents': 111315, 'min_query_length': 2, 'average_query_length': 22.26, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'isl': {'number_of_characters': 37490, 'num_samples': 5256, 'num_queries': 478, 'num_documents': 4778, 'min_document_length': 12, 'average_document_length': 5.85, 'max_document_length': 149, 'unique_documents': 4778, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 478, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 478}, 'ita': {'number_of_characters': 1094898, 'num_samples': 267803, 'num_queries': 10000, 'num_documents': 257803, 'min_document_length': 10, 'average_document_length': 2.25, 'max_document_length': 905, 'unique_documents': 257803, 'min_query_length': 2, 'average_query_length': 51.56, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'jpn': {'number_of_characters': 882722, 'num_samples': 319157, 'num_queries': 10000, 'num_documents': 309157, 'min_document_length': 10, 'average_document_length': 0.86, 'max_document_length': 645, 'unique_documents': 309157, 'min_query_length': 2, 'average_query_length': 61.83, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'kat': {'number_of_characters': 16624, 'num_samples': 2646, 'num_queries': 241, 'num_documents': 2405, 'min_document_length': 13, 'average_document_length': 4.91, 'max_document_length': 125, 'unique_documents': 2405, 'min_query_length': 2, 'average_query_length': 19.96, 'max_query_length': 2, 'unique_queries': 241, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 241}, 'kaz': {'number_of_characters': 21230, 'num_samples': 3295, 'num_queries': 300, 'num_documents': 2995, 'min_document_length': 16, 'average_document_length': 5.09, 'max_document_length': 260, 'unique_documents': 2995, 'min_query_length': 2, 'average_query_length': 19.97, 'max_query_length': 2, 'unique_queries': 300, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 300}, 'kor': {'number_of_characters': 481069, 'num_samples': 112000, 'num_queries': 10000, 'num_documents': 102000, 'min_document_length': 10, 'average_document_length': 2.72, 'max_document_length': 264, 'unique_documents': 102000, 'min_query_length': 2, 'average_query_length': 20.4, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'lav': {'number_of_characters': 89318, 'num_samples': 14429, 'num_queries': 1312, 'num_documents': 13117, 'min_document_length': 10, 'average_document_length': 4.81, 'max_document_length': 184, 'unique_documents': 13117, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 1312, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1312}, 'lit': {'number_of_characters': 126055, 'num_samples': 20200, 'num_queries': 1837, 'num_documents': 18363, 'min_document_length': 9, 'average_document_length': 4.86, 'max_document_length': 420, 'unique_documents': 18363, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 1837, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1837}, 'mar': {'number_of_characters': 46162, 'num_samples': 8145, 'num_queries': 741, 'num_documents': 7404, 'min_document_length': 14, 'average_document_length': 4.23, 'max_document_length': 184, 'unique_documents': 7404, 'min_query_length': 2, 'average_query_length': 19.98, 'max_query_length': 2, 'unique_queries': 741, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 741}, 'msa': {'number_of_characters': 50848, 'num_samples': 7072, 'num_queries': 643, 'num_documents': 6429, 'min_document_length': 11, 'average_document_length': 5.91, 'max_document_length': 189, 'unique_documents': 6429, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 643, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 643}, 'nld': {'number_of_characters': 1238345, 'num_samples': 380662, 'num_queries': 10000, 'num_documents': 370662, 'min_document_length': 9, 'average_document_length': 1.34, 'max_document_length': 1603, 'unique_documents': 370662, 'min_query_length': 2, 'average_query_length': 74.13, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'nor': {'number_of_characters': 433342, 'num_samples': 69556, 'num_queries': 6324, 'num_documents': 63232, 'min_document_length': 7, 'average_document_length': 4.85, 'max_document_length': 227, 'unique_documents': 63232, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 6324, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6324}, 'pol': {'number_of_characters': 902187, 'num_samples': 202515, 'num_queries': 10000, 'num_documents': 192515, 'min_document_length': 10, 'average_document_length': 2.69, 'max_document_length': 1038, 'unique_documents': 192515, 'min_query_length': 2, 'average_query_length': 38.5, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'por': {'number_of_characters': 918403, 'num_samples': 219353, 'num_queries': 10000, 'num_documents': 209353, 'min_document_length': 10, 'average_document_length': 2.39, 'max_document_length': 1154, 'unique_documents': 209353, 'min_query_length': 2, 'average_query_length': 41.87, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'ron': {'number_of_characters': 440199, 'num_samples': 65882, 'num_queries': 5990, 'num_documents': 59892, 'min_document_length': 10, 'average_document_length': 5.35, 'max_document_length': 1176, 'unique_documents': 59892, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 5990, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5990}, 'rus': {'number_of_characters': 1328024, 'num_samples': 397504, 'num_queries': 10000, 'num_documents': 387504, 'min_document_length': 10, 'average_document_length': 1.43, 'max_document_length': 1937, 'unique_documents': 387504, 'min_query_length': 2, 'average_query_length': 77.5, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'slk': {'number_of_characters': 229346, 'num_samples': 34675, 'num_queries': 3153, 'num_documents': 31522, 'min_document_length': 10, 'average_document_length': 5.28, 'max_document_length': 4317, 'unique_documents': 31522, 'min_query_length': 2, 'average_query_length': 19.99, 'max_query_length': 2, 'unique_queries': 3153, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3153}, 'slv': {'number_of_characters': 114951, 'num_samples': 17785, 'num_queries': 1617, 'num_documents': 16168, 'min_document_length': 11, 'average_document_length': 5.11, 'max_document_length': 426, 'unique_documents': 16168, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 1617, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1617}, 'spa': {'number_of_characters': 1758740, 'num_samples': 614661, 'num_queries': 10000, 'num_documents': 604661, 'min_document_length': 10, 'average_document_length': 0.91, 'max_document_length': 559, 'unique_documents': 604661, 'min_query_length': 2, 'average_query_length': 120.93, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'sqi': {'number_of_characters': 15784, 'num_samples': 2285, 'num_queries': 208, 'num_documents': 2077, 'min_document_length': 11, 'average_document_length': 5.6, 'max_document_length': 141, 'unique_documents': 2077, 'min_query_length': 2, 'average_query_length': 19.97, 'max_query_length': 2, 'unique_queries': 208, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 208}, 'srp': {'number_of_characters': 40489, 'num_samples': 6407, 'num_queries': 583, 'num_documents': 5824, 'min_document_length': 8, 'average_document_length': 4.95, 'max_document_length': 128, 'unique_documents': 5824, 'min_query_length': 2, 'average_query_length': 19.98, 'max_query_length': 2, 'unique_queries': 583, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 583}, 'swe': {'number_of_characters': 779495, 'num_samples': 168738, 'num_queries': 10000, 'num_documents': 158738, 'min_document_length': 10, 'average_document_length': 2.91, 'max_document_length': 257, 'unique_documents': 158738, 'min_query_length': 2, 'average_query_length': 31.75, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'tgl': {'number_of_characters': 27050, 'num_samples': 4067, 'num_queries': 370, 'num_documents': 3697, 'min_document_length': 13, 'average_document_length': 5.32, 'max_document_length': 198, 'unique_documents': 3697, 'min_query_length': 2, 'average_query_length': 19.98, 'max_query_length': 2, 'unique_queries': 370, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 370}, 'tha': {'number_of_characters': 311898, 'num_samples': 52172, 'num_queries': 4743, 'num_documents': 47429, 'min_document_length': 10, 'average_document_length': 4.58, 'max_document_length': 472, 'unique_documents': 47429, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 4743, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 4743}, 'tur': {'number_of_characters': 706899, 'num_samples': 154846, 'num_queries': 10000, 'num_documents': 144846, 'min_document_length': 7, 'average_document_length': 2.88, 'max_document_length': 703, 'unique_documents': 144846, 'min_query_length': 2, 'average_query_length': 28.97, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'ukr': {'number_of_characters': 478281, 'num_samples': 75352, 'num_queries': 6851, 'num_documents': 68501, 'min_document_length': 10, 'average_document_length': 4.98, 'max_document_length': 1015, 'unique_documents': 68501, 'min_query_length': 2, 'average_query_length': 20.0, 'max_query_length': 2, 'unique_queries': 6851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6851}, 'urd': {'number_of_characters': 18987, 'num_samples': 3053, 'num_queries': 278, 'num_documents': 2775, 'min_document_length': 19, 'average_document_length': 4.84, 'max_document_length': 108, 'unique_documents': 2775, 'min_query_length': 2, 'average_query_length': 19.96, 'max_query_length': 2, 'unique_queries': 278, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 278}, 'uzb': {'number_of_characters': 8789, 'num_samples': 1390, 'num_queries': 127, 'num_documents': 1263, 'min_document_length': 16, 'average_document_length': 4.96, 'max_document_length': 110, 'unique_documents': 1263, 'min_query_length': 2, 'average_query_length': 19.89, 'max_query_length': 2, 'unique_queries': 127, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 127}, 'vie': {'number_of_characters': 787166, 'num_samples': 133972, 'num_queries': 10000, 'num_documents': 123972, 'min_document_length': 9, 'average_document_length': 4.35, 'max_document_length': 1061, 'unique_documents': 123972, 'min_query_length': 2, 'average_query_length': 24.79, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}, 'zho': {'number_of_characters': 468187, 'num_samples': 142491, 'num_queries': 10000, 'num_documents': 132491, 'min_document_length': 7, 'average_document_length': 1.53, 'max_document_length': 671, 'unique_documents': 132491, 'min_query_length': 2, 'average_query_length': 26.5, 'max_query_length': 2, 'unique_queries': 10000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10000}}}} | @@ -892,15 +1006,16 @@ The following tables give you an overview of the tasks in MTEB. | [WebQAT2TRetrieval](https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html) (Chang et al., 2022) | ['eng'] | Any2AnyRetrieval | t2t | [Encyclopaedic] | {'test': 546912} | {'test': {'number_of_characters': 126324224, 'num_samples': 546912, 'num_queries': 2455, 'num_documents': 544457, 'min_document_length': 57, 'average_document_length': 231.52, 'max_document_length': 959, 'unique_documents': 544457, 'num_document_images': 0, 'min_query_length': 26, 'average_query_length': 109.95, 'max_query_length': 432, 'unique_queries': 2455, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 2.04, 'max_relevant_docs_per_query': 4, 'unique_relevant_docs': 4862}} | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | -| [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikiSQLRetrieval](https://huggingface.co/datasets/embedding-benchmark/WikiSQL_mteb) (Zhong et al., 2017) | ['eng', 'sql'] | Retrieval | s2s | [Programming] | {'test': 4096} | {'test': {'number_of_characters': 3293525, 'num_samples': 4096, 'num_queries': 2048, 'num_documents': 2048, 'min_document_length': 540, 'average_document_length': 1606.17, 'max_document_length': 5833, 'unique_documents': 2048, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 2048, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2048}} | +| [WikipediaBioMetChemClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaBiolumNeurochemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaChemEngSpecialtiesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | -| [WikipediaChemFieldsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemFieldsClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaChemistryTopicsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaChemistryTopicsClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | -| [WikipediaCompChemSpectroscopyClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCompChemSpectroscopyClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaCryobiologySeparationClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | -| [WikipediaCrystallographyAnalyticalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCrystallographyAnalyticalClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaGreenhouseEnantiopureClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaIsotopesFissionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaLuminescenceClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | @@ -910,10 +1025,10 @@ The following tables give you an overview of the tasks in MTEB. | [WikipediaSaltsSemiconductorsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaSolidStateColloidalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | -| [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaTheoreticalAppliedClassification.v2](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WinoGrande](https://winogrande.allenai.org/) (Sakaguchi et al., 2021) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [Winoground](https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper) (Tristan Thrush, 2022) | ['eng'] | Compositionality | i2t | [Social] | {'test': 400} | {'test': {'num_samples': 400, 'num_images': 800, 'num_texts': 800, 'num_unique_texts': 800, 'min_text_length': 8, 'average_text_length': 45.47, 'max_text_length': 151}} | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) (Suriyawongkul et al., 2019) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | +| [WisesightSentimentClassification.v2](https://github.com/PyThaiNLP/wisesight-sentiment) (Suriyawongkul et al., 2019) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | | [XFlickr30kCoT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['deu', 'eng', 'ind', 'jpn', 'rus', 'spa', 'tur', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 32000} | {'test': {'number_of_characters': 1149877, 'num_samples': 32000, 'num_queries': 16000, 'num_documents': 16000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 16000, 'min_query_length': 12, 'average_query_length': 71.87, 'max_query_length': 385, 'unique_queries': 15987, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16000, 'hf_subset_descriptive_stats': {'de': {'number_of_characters': 132154, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 66.08, 'max_query_length': 220, 'unique_queries': 1994, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'en': {'number_of_characters': 153801, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 34, 'average_query_length': 76.9, 'max_query_length': 377, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'es': {'number_of_characters': 160049, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 23, 'average_query_length': 80.02, 'max_query_length': 342, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'id': {'number_of_characters': 167858, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 83.93, 'max_query_length': 211, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ja': {'number_of_characters': 75480, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 9, 'average_query_length': 37.74, 'max_query_length': 179, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ru': {'number_of_characters': 149947, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 74.97, 'max_query_length': 294, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'tr': {'number_of_characters': 136134, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 19, 'average_query_length': 68.07, 'max_query_length': 199, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'zh': {'number_of_characters': 46454, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 23.23, 'max_query_length': 66, 'unique_queries': 1999, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}}}} | | [XGlueWPRReranking](https://github.com/microsoft/XGLUE) (Zeman et al., 2019) | ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'zho'] | Reranking | s2p | [Written] | None | None | | [XM3600T2IRetrieval](https://aclanthology.org/2022.emnlp-main.45/) (Thapliyal et al., 2022) | ['ara', 'ben', 'ces', 'dan', 'deu', 'ell', 'eng', 'fas', 'fil', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mri', 'nld', 'nor', 'pol', 'por', 'quz', 'ron', 'rus', 'spa', 'swa', 'swe', 'tel', 'tha', 'tur', 'ukr', 'vie', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 390975} | {'test': {'number_of_characters': 17009034, 'num_samples': 390975, 'num_queries': 261375, 'num_documents': 129600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 129600, 'min_query_length': 9, 'average_query_length': 65.08, 'max_query_length': 532, 'unique_queries': 259932, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 129600, 'hf_subset_descriptive_stats': {'ar': {'number_of_characters': 310802, 'num_samples': 10967, 'num_queries': 7367, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 42.19, 'max_query_length': 208, 'unique_queries': 7339, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'bn': {'number_of_characters': 223622, 'num_samples': 7200, 'num_queries': 3600, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 28, 'average_query_length': 62.12, 'max_query_length': 139, 'unique_queries': 3594, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'cs': {'number_of_characters': 282069, 'num_samples': 10807, 'num_queries': 7207, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 39.14, 'max_query_length': 266, 'unique_queries': 6814, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'da': {'number_of_characters': 351028, 'num_samples': 10864, 'num_queries': 7264, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 48.32, 'max_query_length': 158, 'unique_queries': 7246, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'de': {'number_of_characters': 660790, 'num_samples': 12243, 'num_queries': 8643, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 76.45, 'max_query_length': 334, 'unique_queries': 8643, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'el': {'number_of_characters': 370363, 'num_samples': 10804, 'num_queries': 7204, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 51.41, 'max_query_length': 262, 'unique_queries': 7100, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'en': {'number_of_characters': 356488, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 49.51, 'max_query_length': 148, 'unique_queries': 7129, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'es': {'number_of_characters': 485004, 'num_samples': 12214, 'num_queries': 8614, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 12, 'average_query_length': 56.3, 'max_query_length': 179, 'unique_queries': 8605, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fa': {'number_of_characters': 430055, 'num_samples': 10845, 'num_queries': 7245, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 59.36, 'max_query_length': 289, 'unique_queries': 7242, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fi': {'number_of_characters': 464334, 'num_samples': 10727, 'num_queries': 7127, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 65.15, 'max_query_length': 336, 'unique_queries': 7110, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fil': {'number_of_characters': 480287, 'num_samples': 10709, 'num_queries': 7109, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 67.56, 'max_query_length': 332, 'unique_queries': 7016, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fr': {'number_of_characters': 595836, 'num_samples': 12162, 'num_queries': 8562, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 69.59, 'max_query_length': 173, 'unique_queries': 8560, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'he': {'number_of_characters': 457775, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 63.58, 'max_query_length': 453, 'unique_queries': 7190, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hi': {'number_of_characters': 509092, 'num_samples': 12103, 'num_queries': 8503, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 59.87, 'max_query_length': 188, 'unique_queries': 8422, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hr': {'number_of_characters': 420595, 'num_samples': 10880, 'num_queries': 7280, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 57.77, 'max_query_length': 271, 'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hu': {'number_of_characters': 436677, 'num_samples': 10816, 'num_queries': 7216, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 60.52, 'max_query_length': 393, 'unique_queries': 7209, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'id': {'number_of_characters': 666387, 'num_samples': 10726, 'num_queries': 7126, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 93.51, 'max_query_length': 286, 'unique_queries': 7125, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'it': {'number_of_characters': 608604, 'num_samples': 12071, 'num_queries': 8471, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 71.85, 'max_query_length': 201, 'unique_queries': 8470, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ja': {'number_of_characters': 186672, 'num_samples': 10785, 'num_queries': 7185, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 25.98, 'max_query_length': 97, 'unique_queries': 7175, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ko': {'number_of_characters': 188812, 'num_samples': 11250, 'num_queries': 7650, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 24.68, 'max_query_length': 113, 'unique_queries': 7644, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'mi': {'number_of_characters': 262800, 'num_samples': 8332, 'num_queries': 4732, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 55.54, 'max_query_length': 304, 'unique_queries': 4707, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'nl': {'number_of_characters': 370231, 'num_samples': 11659, 'num_queries': 8059, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 45.94, 'max_query_length': 173, 'unique_queries': 8004, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'no': {'number_of_characters': 391381, 'num_samples': 10813, 'num_queries': 7213, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 54.26, 'max_query_length': 162, 'unique_queries': 7191, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pl': {'number_of_characters': 411189, 'num_samples': 10741, 'num_queries': 7141, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 57.58, 'max_query_length': 226, 'unique_queries': 7117, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pt': {'number_of_characters': 446873, 'num_samples': 10843, 'num_queries': 7243, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 'average_query_length': 61.7, 'max_query_length': 324, 'unique_queries': 7220, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'quz': {'number_of_characters': 278263, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 38.65, 'max_query_length': 234, 'unique_queries': 7130, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ro': {'number_of_characters': 629977, 'num_samples': 10723, 'num_queries': 7123, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 88.44, 'max_query_length': 524, 'unique_queries': 7122, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ru': {'number_of_characters': 477558, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 66.33, 'max_query_length': 232, 'unique_queries': 7194, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sv': {'number_of_characters': 339400, 'num_samples': 10873, 'num_queries': 7273, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 46.67, 'max_query_length': 174, 'unique_queries': 7199, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sw': {'number_of_characters': 444085, 'num_samples': 10646, 'num_queries': 7046, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 63.03, 'max_query_length': 299, 'unique_queries': 7014, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'te': {'number_of_characters': 341340, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 47.41, 'max_query_length': 132, 'unique_queries': 7062, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'th': {'number_of_characters': 344730, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 47.88, 'max_query_length': 147, 'unique_queries': 7170, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'tr': {'number_of_characters': 458639, 'num_samples': 10833, 'num_queries': 7233, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 63.41, 'max_query_length': 453, 'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'uk': {'number_of_characters': 474311, 'num_samples': 10815, 'num_queries': 7215, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 'average_query_length': 65.74, 'max_query_length': 372, 'unique_queries': 7206, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'vi': {'number_of_characters': 582546, 'num_samples': 10950, 'num_queries': 7350, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 79.26, 'max_query_length': 287, 'unique_queries': 7350, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'zh': {'number_of_characters': 165110, 'num_samples': 10774, 'num_queries': 7174, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 23.02, 'max_query_length': 96, 'unique_queries': 7165, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}}}} | @@ -923,9 +1038,9 @@ The following tables give you an overview of the tasks in MTEB. | [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | None | None | | [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | None | None | | [XStance](https://github.com/ZurichNLP/xstance) (Vamvas et al., 2020) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | -| [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | -| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | -| [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | +| [YahooAnswersTopicsClassification.v2](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | +| [YelpReviewFullClassification.v2](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | +| [YueOpenriceReviewClassification.v2](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | | [ZacLegalTextRetrieval](https://challenge.zalo.ai/) | ['vie'] | Retrieval | s2p | [Legal] | None | None | | [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Mahendra et al., 2021) | ['ind'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283654099, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 75, 'min_instruction_length': 93, 'average_instruction_length': 389.95, 'max_instruction_length': 887, 'unique_instructions': 75, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 450.55, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'eng-fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129597567, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 80.08, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 150, 'average_instruction_length': 396.88, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 205, 'average_changed_instruction_length': 463.18, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109522175, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 40, 'min_instruction_length': 93, 'average_instruction_length': 371.12, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 431.8, 'max_changed_instruction_length': 957, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44534357, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 32, 'average_query_length': 83.56, 'max_query_length': 159, 'unique_queries': 43, 'min_instruction_length': 157, 'average_instruction_length': 401.02, 'max_instruction_length': 731, 'unique_instructions': 43, 'min_changed_instruction_length': 209, 'average_changed_instruction_length': 456.26, 'max_changed_instruction_length': 822, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | @@ -943,1061 +1058,1061 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisionCentricQA | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | -|----------|----------|--------|--------------------|------------------------------|------------------|--------------|----------------|------------|------------------|-----------------------|---------------------|-----------------|-------------------------------|----------------------|--------------------------|--------------------|-----------|-----------|-----|-------|---------------|-----------------|----------------|------------------|------------------------|-----| -| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aau | Abau | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abt | Ambulas | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ace | Achinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| acr | Achi | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| adz | Adzera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agg | Angor | Senagi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agm | Angaataha | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agr | Aguaruna | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agu | Aguacateco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aia | Arosi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ake | Akawaio | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alp | Alune | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alq | Algonquin | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| amk | Ambai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amn | Amanab | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amp | Alamblak | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ang | Old English (ca. 450-1100) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aom | Ömie | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aon | Bumbita Arapesh | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apb | Sa'a | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apn | Apinayé | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 38 | -| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arp | Arapaho | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| asm | Assamese | Indo-European | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ast | Asturian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auc | Waorani | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aui | Anuki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awa | Awadhi | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bak | Bashkir | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bam | Bambara | Mande | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| ban | Balinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbb | Barai | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bch | Bariai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bco | Kaluli | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bdd | Bunama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 0 | 1 | 0 | 9 | 9 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 33 | -| beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bew | Betawi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bhp | Bima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| big | Biangai | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjk | Barok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjn | Banjar | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjv | Bedjond | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkd | Binukid | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bki | Baki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkq | Bakairí | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blw | Balangao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blz | Balantak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmr | Muinane | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bnp | Bola | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bns | Bundeli | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boa | Bora | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bos | Bosnian | Indo-European | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqp | Busa | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bre | Breton | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsn | Barasana-Eduria | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bug | Buginese | Austronesian | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| buk | Bugawac | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| bus | Bokobaru | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvr | Burarra | Maningrida | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byr | Baruya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byx | Qaqet | Baining | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzd | Bribri | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caa | Chortí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cab | Garifuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cac | Chuj | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cao | Chácobo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| car | Galibi Carib | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cat | Catalan | Indo-European | 0 | 0 | 0 | 5 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| cav | Cavineña | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 0 | 1 | 0 | 6 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cha | Chamorro | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chk | Chuukese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chz | Ozumacín Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| cjo | Ashéninka Pajonal | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjv | Chuave | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| cle | Lealao Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 0 | 0 | 2 | 0 | 46 | -| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cni | Asháninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | -| cof | Colorado | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| con | Cofán | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cor | Cornish | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cot | Caquinte | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpc | Ajyíninka Apurucayali | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpu | Pichis Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpy | South Ucayali Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csb | Kashubian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cso | Sochiapam Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctu | Chol | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cut | Teutila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cym | Welsh | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 0 | 2 | 0 | 8 | 10 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | -| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 7 | 3 | 21 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 75 | -| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dif | Dieri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dik | Southwestern Dinka | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| div | Dhivehi | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dob | Dobu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dww | Dawawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dyu | Dyula | Mande | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 49 | 20 | 161 | 21 | 7 | 15 | 22 | 5 | 0 | 3 | 1 | 13 | 10 | 122 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 508 | -| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| est | Estonian | Uralic | 0 | 1 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| etr | Edolo | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eus | Basque | Unclassified | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| faa | Fasu | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 0 | 1 | 0 | 6 | 28 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 95 | -| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 6 | 4 | 18 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 73 | -| fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fur | Friulian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gai | Borei | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| geb | Kire | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gle | Irish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glg | Galician | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glk | Gilaki | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnw | Western Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| grn | Guarani | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gub | Guajajára | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gui | Eastern Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guj | Gujarati | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gun | Mbyá Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gux | Gourmanchéma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gym | Ngäbere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gyr | Guarayu | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hat | Haitian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 44 | -| hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmn | Hmong | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hot | Hote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hrv | Croatian | Indo-European | 0 | 1 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic | 0 | 1 | 0 | 7 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hvn | Sabu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hye | Armenian | Indo-European | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| ian | Iatmul | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| ido | Ido | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ile | Interlingue | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ilo | Iloko | Austronesian | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 0 | 3 | 0 | 8 | 7 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| isl | Icelandic | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 2 | 8 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 42 | -| iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jae | Yabem | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jav | Javanese | Austronesian | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| jic | Tol | Jicaquean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 21 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 52 | -| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kan | Kannada | Dravidian | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | -| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| kat | Georgian | Kartvelian | 0 | 0 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| kaz | Kazakh | Turkic | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| kbc | Kadiwéu | Guaicuruan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbh | Camsá | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbm | Iwal | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbp | Kabiyè | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kde | Makonde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kek | Kekchí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| kir | Kirghiz | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kje | Kisar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kmg | Kâte | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmo | Kwoma | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmu | Kanite | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 11 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 41 | -| kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpj | Karajá | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kql | Kyenele | Yuat | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqw | Kandas | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksj | Uare | Kwalean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ktm | Kurti | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kto | Kuot | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kur | Kurdish | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwj | Kwanga | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyf | Kouya | Kru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyz | Kayabí | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lac | Lacandon | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lat | Latin | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| lav | Latvian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| lbb | Label | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lcm | Tungag | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| leu | Kara (Papua New Guinea) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lex | Luang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lgl | Wala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lij | Ligurian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lim | Limburgan | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| llg | Lole | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lmo | Lombard | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lww | Lewo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mad | Madurese | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| mag | Magahi | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mai | Maithili | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| maj | Jalapa De Díaz Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mak | Makasar | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mal | Malayalam | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | -| mam | Mam | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maq | Chiquihuitlán Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mar | Marathi | Indo-European | 0 | 0 | 0 | 9 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | -| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mav | Sateré-Mawé | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| max | North Moluccan Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maz | Central Mazahua | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbc | Macushi | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbj | Nadëb | Naduhup | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbl | Maxakalí | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mca | Maca | Mataguayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcf | Matsés | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mco | Coatlán Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcq | Ese | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcr | Menya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mee | Mengen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mek | Mekeo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| met | Mato | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meu | Motu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mib | Atatláhuca Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mih | Chayuco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mil | Peñoles Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| min | Minangkabau | Austronesian | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mir | Isthmus Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mle | Manambu | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlp | Bargam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mmx | Madak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mna | Mbula | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mon | Mongolian | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mop | Mopán Maya | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mox | Molima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpm | Yosondúa Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mps | Dadibi | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msy | Aruamu | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mui | Musi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mup | Malvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| mux | Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mva | Manam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwc | Are | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwr | Marwari | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxb | Tezoatlán Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myu | Mundurukú | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myw | Muyuw | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myy | Macuna | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nab | Southern Nambikuára | Nambiquaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nak | Nakanai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncl | Michoacán Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nds | Low German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nep | Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nfa | Dhao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nho | Takuu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 0 | 1 | 0 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 31 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 56 | -| nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 4 | 8 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nop | Numanggang | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nor | Norwegian | Indo-European | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nsn | Nehan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| nss | Nali | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntu | Natügu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nus | Nuer | Nilotic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nvm | Namiae | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ong | Olo | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| orm | Oromo | Unclassified | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| orv | Old Russian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ory | Odia | Indo-European | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | -| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otq | Querétaro Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ots | Estado de México Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pab | Parecís | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pad | Paumarí | Arawan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pah | Tenharim | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pam | Pampanga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pan | Panjabi | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pap | Papiamento | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| pib | Yine | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pio | Piapoco | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| plu | Palikúr | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pma | Paama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pms | Piemontese | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 0 | 1 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 20 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 53 | -| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 0 | 1 | 0 | 6 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 2 | 7 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 36 | -| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pri | Paicî | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prs | Dari | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ptp | Patep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ptu | Bambam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pus | Pushto | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qub | Huallaga Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quc | K'iche' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qve | Eastern Apurímac Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvn | North Junín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvs | San Martín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvw | Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qwh | Huaylas Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxh | Panao Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rej | Rejang | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmc | Carpathian Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rom | Romany | Unclassified | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| ron | Romanian | Indo-European | 0 | 1 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rop | Kriol | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| row | Dela-Oenale | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rro | Waima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 18 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 66 | -| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| san | Sanskrit | Indo-European | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sat | Santali | Austroasiatic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sbe | Saliba | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| scn | Sicilian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| seh | Sena | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sey | Secoya | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgz | Sursurunga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shj | Shatt | Dajuic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slk | Slovak | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | -| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slv | Slovenian | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| smo | Samoan | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snd | Sindhi | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| snn | Siona | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 16 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 60 | -| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sps | Saposa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spy | Sabaot | Nilotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sqi | Albanian | Unclassified | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| srd | Sardinian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sri | Siriano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| srp | Serbian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| srq | Sirionó | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssg | Seimat | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sua | Sulka | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| szl | Silesian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tah | Tahitian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tam | Tamil | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tat | Tatar | Turkic | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbc | Takia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbf | Mandara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbo | Tawala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tca | Ticuna | Ticuna-Yuri | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tfr | Teribe | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgk | Tajik | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | -| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tmd | Haruai | Piawi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnc | Tanimuca-Retuarã | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tod | Toma | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toj | Tojolabal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| too | Xicotepec De Juárez Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpa | Taupota | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ttc | Tektiteko | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuc | Mutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 5 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 29 | -| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| txu | Kayapó | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzl | Talossan | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tzo | Tzotzil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubr | Ubir | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| udu | Uduk | Koman | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uig | Uighur | Turkic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 6 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| uli | Ulithian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ura | Urarina | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urb | Urubú-Kaapor | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urd | Urdu | Indo-European | 0 | 0 | 0 | 9 | 8 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urt | Urat | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usp | Uspanteco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvl | Lote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wal | Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wap | Wapishana | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbi | Vwanji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wed | Wedau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wer | Weri | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiu | Wiru | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiv | Vitu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnu | Usan | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wro | Worrorra | Worrorran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrs | Waris | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wsk | Waskia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xav | Xavánte | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xsi | Sio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtm | Magdalena Peñasco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yal | Yalunka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yap | Yapese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| yid | Yiddish | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yka | Yakan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zai | Isthmus Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zap | Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 29 | -| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpv | Chichicapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1494 | 848 | 316 | 7 | 41 | 22 | 5 | 0 | 3 | 29 | 93 | 65 | 631 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | +| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Regression | Reranking | Retrieval | STS | Speed | Summarization | VisionCentricQA | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | +|----------|----------|--------|--------------------|------------------------------|------------------|--------------|----------------|------------|------------------|-----------------------|---------------------|-----------------|-------------------------------|----------------------|--------------------------|--------------------|------------|-----------|-----------|-----|-------|---------------|-----------------|----------------|------------------|------------------------|-----| +| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aau | Abau | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abt | Ambulas | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ace | Achinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| acr | Achi | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| adz | Adzera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agg | Angor | Senagi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agm | Angaataha | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agr | Aguaruna | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agu | Aguacateco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aia | Arosi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ake | Akawaio | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alp | Alune | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alq | Algonquin | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| amk | Ambai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amn | Amanab | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amp | Alamblak | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ang | Old English (ca. 450-1100) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aom | Ömie | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aon | Bumbita Arapesh | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apb | Sa'a | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apn | Apinayé | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 44 | +| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arp | Arapaho | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| asm | Assamese | Indo-European | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ast | Asturian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auc | Waorani | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aui | Anuki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awa | Awadhi | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bak | Bashkir | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bam | Bambara | Mande | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| ban | Balinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbb | Barai | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bch | Bariai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bco | Kaluli | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bdd | Bunama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ben | Bengali | Indo-European | 0 | 1 | 0 | 9 | 9 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 35 | +| beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bew | Betawi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bhp | Bima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| big | Biangai | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjk | Barok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjn | Banjar | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjv | Bedjond | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkd | Binukid | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bki | Baki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkq | Bakairí | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blw | Balangao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blz | Balantak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmr | Muinane | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bnp | Bola | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bns | Bundeli | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boa | Bora | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bos | Bosnian | Indo-European | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqp | Busa | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bre | Breton | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsn | Barasana-Eduria | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bug | Buginese | Austronesian | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| buk | Bugawac | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | +| bus | Bokobaru | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvr | Burarra | Maningrida | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byr | Baruya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byx | Qaqet | Baining | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzd | Bribri | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caa | Chortí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cab | Garifuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cac | Chuj | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cao | Chácobo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| car | Galibi Carib | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cat | Catalan | Indo-European | 0 | 0 | 0 | 5 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| cav | Cavineña | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ces | Czech | Indo-European | 0 | 1 | 0 | 6 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cha | Chamorro | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chk | Chuukese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chz | Ozumacín Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| cjo | Ashéninka Pajonal | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjv | Chuave | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| cle | Lealao Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 4 | 10 | 9 | 0 | 0 | 0 | 0 | 2 | 0 | 46 | +| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cni | Asháninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 44 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | +| cof | Colorado | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| con | Cofán | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cor | Cornish | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cot | Caquinte | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpc | Ajyíninka Apurucayali | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpu | Pichis Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpy | South Ucayali Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csb | Kashubian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cso | Sochiapam Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctu | Chol | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cut | Teutila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cym | Welsh | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dan | Danish | Indo-European | 0 | 2 | 0 | 8 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 33 | +| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 9 | 0 | 0 | 0 | 0 | 1 | 7 | 0 | 3 | 21 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 80 | +| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dif | Dieri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dik | Southwestern Dinka | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| div | Dhivehi | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dob | Dobu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dww | Dawawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dyu | Dyula | Mande | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 0 | 3 | 49 | 21 | 165 | 21 | 7 | 44 | 22 | 5 | 0 | 3 | 1 | 13 | 2 | 10 | 137 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 559 | +| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| est | Estonian | Uralic | 0 | 1 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| etr | Edolo | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eus | Basque | Unclassified | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| faa | Fasu | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fas | Persian | Indo-European | 0 | 1 | 0 | 6 | 28 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 95 | +| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 9 | 0 | 0 | 0 | 0 | 1 | 6 | 0 | 4 | 18 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 78 | +| fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fur | Friulian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gai | Borei | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| geb | Kire | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gle | Irish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glg | Galician | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glk | Gilaki | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnw | Western Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| grn | Guarani | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gub | Guajajára | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gui | Eastern Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guj | Gujarati | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gun | Mbyá Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gux | Gourmanchéma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gym | Ngäbere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gyr | Guarayu | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hat | Haitian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 49 | +| hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmn | Hmong | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hot | Hote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hrv | Croatian | Indo-European | 0 | 1 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hun | Hungarian | Uralic | 0 | 1 | 0 | 7 | 3 | 1 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hvn | Sabu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hye | Armenian | Indo-European | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| ian | Iatmul | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| ido | Ido | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ile | Interlingue | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ilo | Iloko | Austronesian | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ind | Indonesian | Austronesian | 0 | 3 | 0 | 8 | 7 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | +| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| isl | Icelandic | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 11 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 2 | 8 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 45 | +| iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jae | Yabem | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jav | Javanese | Austronesian | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| jic | Tol | Jicaquean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 9 | 3 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 5 | 21 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 60 | +| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kan | Kannada | Dravidian | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | +| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| kat | Georgian | Kartvelian | 0 | 0 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| kaz | Kazakh | Turkic | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| kbc | Kadiwéu | Guaicuruan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbh | Camsá | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbm | Iwal | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbp | Kabiyè | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kde | Makonde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kek | Kekchí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| kir | Kirghiz | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kje | Kisar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kmg | Kâte | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmo | Kwoma | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmu | Kanite | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 1 | 11 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 43 | +| kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpj | Karajá | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kql | Kyenele | Yuat | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqw | Kandas | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksj | Uare | Kwalean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ktm | Kurti | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kto | Kuot | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kur | Kurdish | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwj | Kwanga | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyf | Kouya | Kru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyz | Kayabí | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lac | Lacandon | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lat | Latin | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| lav | Latvian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| lbb | Label | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lcm | Tungag | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| leu | Kara (Papua New Guinea) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lex | Luang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lgl | Wala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lij | Ligurian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lim | Limburgan | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| llg | Lole | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lmo | Lombard | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lww | Lewo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mad | Madurese | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| mag | Magahi | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mai | Maithili | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| maj | Jalapa De Díaz Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mak | Makasar | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mal | Malayalam | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | +| mam | Mam | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maq | Chiquihuitlán Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mar | Marathi | Indo-European | 0 | 0 | 0 | 9 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | +| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mav | Sateré-Mawé | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| max | North Moluccan Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maz | Central Mazahua | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbc | Macushi | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbj | Nadëb | Naduhup | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbl | Maxakalí | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mca | Maca | Mataguayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcf | Matsés | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mco | Coatlán Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcq | Ese | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcr | Menya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mee | Mengen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mek | Mekeo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| met | Mato | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meu | Motu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mib | Atatláhuca Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mih | Chayuco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mil | Peñoles Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| min | Minangkabau | Austronesian | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mir | Isthmus Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mle | Manambu | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlp | Bargam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mmx | Madak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mna | Mbula | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mon | Mongolian | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mop | Mopán Maya | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mox | Molima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpm | Yosondúa Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mps | Dadibi | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msy | Aruamu | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mui | Musi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mup | Malvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| mux | Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mva | Manam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwc | Are | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwr | Marwari | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxb | Tezoatlán Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myu | Mundurukú | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myw | Muyuw | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myy | Macuna | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nab | Southern Nambikuára | Nambiquaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nak | Nakanai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncl | Michoacán Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nds | Low German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nep | Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nfa | Dhao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nho | Takuu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nld | Dutch | Indo-European | 0 | 1 | 0 | 8 | 6 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 31 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 59 | +| nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 4 | 8 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nop | Numanggang | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nsn | Nehan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| nss | Nali | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntu | Natügu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nus | Nuer | Nilotic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nvm | Namiae | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ong | Olo | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| orm | Oromo | Unclassified | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| orv | Old Russian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ory | Odia | Indo-European | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otq | Querétaro Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ots | Estado de México Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pab | Parecís | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pad | Paumarí | Arawan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pah | Tenharim | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pam | Pampanga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pan | Panjabi | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pap | Papiamento | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| pib | Yine | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pio | Piapoco | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| plu | Palikúr | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pma | Paama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pms | Piemontese | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pol | Polish | Indo-European | 0 | 1 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 20 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 53 | +| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| por | Portuguese | Indo-European | 0 | 1 | 0 | 6 | 9 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 2 | 7 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 38 | +| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pri | Paicî | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prs | Dari | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ptp | Patep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ptu | Bambam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pus | Pushto | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qub | Huallaga Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quc | K'iche' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qve | Eastern Apurímac Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvn | North Junín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvs | San Martín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvw | Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qwh | Huaylas Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxh | Panao Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rej | Rejang | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmc | Carpathian Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rom | Romany | Unclassified | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| ron | Romanian | Indo-European | 0 | 1 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rop | Kriol | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| row | Dela-Oenale | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rro | Waima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| rus | Russian | Indo-European | 0 | 2 | 0 | 8 | 20 | 6 | 0 | 6 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 78 | +| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| san | Sanskrit | Indo-European | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sat | Santali | Austroasiatic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sbe | Saliba | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| scn | Sicilian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| seh | Sena | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sey | Secoya | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgz | Sursurunga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shj | Shatt | Dajuic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slk | Slovak | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slv | Slovenian | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smo | Samoan | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snd | Sindhi | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| snn | Siona | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 9 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 3 | 16 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 65 | +| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sps | Saposa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spy | Sabaot | Nilotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sqi | Albanian | Unclassified | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| srd | Sardinian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sri | Siriano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| srp | Serbian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| srq | Sirionó | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssg | Seimat | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sua | Sulka | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| szl | Silesian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tah | Tahitian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tam | Tamil | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tat | Tatar | Turkic | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbc | Takia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbf | Mandara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbo | Tawala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tca | Ticuna | Ticuna-Yuri | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tfr | Teribe | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgk | Tajik | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 29 | +| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tmd | Haruai | Piawi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnc | Tanimuca-Retuarã | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tod | Toma | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toj | Tojolabal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| too | Xicotepec De Juárez Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpa | Taupota | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ttc | Tektiteko | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuc | Mutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tur | Turkish | Turkic | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 5 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 29 | +| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| txu | Kayapó | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzl | Talossan | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tzo | Tzotzil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubr | Ubir | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| udu | Uduk | Koman | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uig | Uighur | Turkic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 6 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| uli | Ulithian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ura | Urarina | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urb | Urubú-Kaapor | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urd | Urdu | Indo-European | 0 | 0 | 0 | 9 | 8 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | +| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urt | Urat | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usp | Uspanteco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvl | Lote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 3 | 34 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 79 | +| viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wal | Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wap | Wapishana | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbi | Vwanji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wed | Wedau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wer | Weri | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiu | Wiru | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiv | Vitu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnu | Usan | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wro | Worrorra | Worrorran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrs | Waris | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wsk | Waskia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xav | Xavánte | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xsi | Sio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtm | Magdalena Peñasco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yal | Yalunka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yap | Yapese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| yid | Yiddish | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yka | Yakan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | +| yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zai | Isthmus Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zap | Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | +| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpv | Chichicapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| Total | None | None | None | 0 | 55 | 49 | 1496 | 868 | 321 | 7 | 137 | 22 | 5 | 0 | 3 | 29 | 96 | 4 | 68 | 680 | 91 | 2 | 2 | 6 | 7 | 37 | 24 | diff --git a/docs/usage/usage.md b/docs/usage/usage.md index f985f835f7..a1323e5060 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -346,7 +346,7 @@ evaluation = mteb.MTEB(tasks=tasks) ``` In prompts the key can be: -1. Prompt types (`passage`, `query`) - they will be used in reranking and retrieval tasks +1. Prompt types (`document`, `query`) - they will be used in reranking and retrieval tasks 2. Task type - these prompts will be used in all tasks of the given type 1. `BitextMining` 2. `Classification` diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index ad9725799d..8a6313b6c6 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -249,6 +249,20 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): unique_relevant_docs: int +def _filter_queries_without_positives( + relevant_docs: dict, queries: dict +) -> tuple[dict, dict]: + _relevant_docs = {} + _queries = {} + for idx in relevant_docs: + if len(relevant_docs[idx]) == 0: # no relevant docs + continue + _relevant_docs[idx] = relevant_docs[idx] + _queries[idx] = queries[idx] + + return _relevant_docs, _queries + + class AbsTaskRetrieval(AbsTask): """Abstract class for retrieval experiments. @@ -349,6 +363,11 @@ def evaluate( def _evaluate_subset( self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs ) -> ScoresDict: + # ensure queries format (see #3030) + relevant_docs, queries = _filter_queries_without_positives( + relevant_docs, queries + ) + start_time = time() results = retriever(corpus, queries) end_time = time() diff --git a/mteb/abstasks/AbsTaskTextRegression.py b/mteb/abstasks/AbsTaskTextRegression.py new file mode 100644 index 0000000000..417bed14f0 --- /dev/null +++ b/mteb/abstasks/AbsTaskTextRegression.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import logging +from typing import Any + +import datasets +import numpy as np +import pandas as pd +from sklearn.linear_model import LinearRegression + +from mteb.abstasks.AbsTask import AbsTask +from mteb.abstasks.TaskMetadata import DescriptiveStatistics, HFSubset +from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.RegressionEvaluator import ( + LinearRegressionEvaluator, + SklearnRegressorModel, +) +from mteb.load_results.task_results import ScoresDict + +logger = logging.getLogger(__name__) + + +class RegressionDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for Regression + + Attributes: + num_samples: number of samples in the dataset. + number_of_characters: Total number of symbols in the dataset. + num_texts_in_train: Number of texts in the train split + + min_text_length: Minimum length of text + average_text_length: Average length of text + max_text_length: Maximum length of text + unique_text: Number of unique texts + + min_value: Minimum of the target variable + average_value: Average of the target variable + max_value: Maximum of the target variable + """ + + num_samples: int + number_of_characters: int + num_texts_in_train: int | None + + min_text_length: int + average_text_length: float + max_text_length: int + unique_text: int + + min_value: float + average_value: float + max_value: float + + +class AbsTaskTextRegression(AbsTask): + """Abstract class for regression tasks + + self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It + must contain the following columns: + text: str + value: float + """ + + evaluator: type[LinearRegressionEvaluator] = LinearRegressionEvaluator + regressor: SklearnRegressorModel = LinearRegression(n_jobs=-1) + + train_split: str = "train" + label_column_name: str = "value" + input_column_name: str = "text" + abstask_prompt = "Predict the value of the user passage." + + n_experiments: int = 10 + n_samples: int = 2048 + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + self.n_experiments = self.metadata_dict.get("n_experiments", self.n_experiments) + self.n_samples = self.metadata_dict.get("n_samples", self.n_samples) + + def _evaluate_subset( + self, + model: Encoder, + dataset: datasets.DatasetDict, + hf_split: str, + *, + hf_subset: str | None = None, + encode_kwargs: dict[str, Any], + **kwargs, + ) -> ScoresDict: + train_split = dataset[self.train_split] + eval_split = dataset[hf_split] + + scores_list, test_cache = [], None + for i in range(self.n_experiments): + logger.info( + "=" * 10 + f" Experiment {i + 1}/{self.n_experiments} " + "=" * 10 + ) + + if self.n_samples >= len(train_split): + train_split_sampled = train_split + else: + train_split_sampled = self.stratified_subsampling( + datasets.DatasetDict({"train": train_split}), + seed=self.seed + i, + splits=["train"], + label=self.label_column_name, + n_samples=self.n_samples, + )["train"] + + evaluator = self.evaluator( + train_split_sampled[self.input_column_name], + train_split_sampled[self.label_column_name], + eval_split[self.input_column_name], + eval_split[self.label_column_name], + task_name=self.metadata.name, + hf_split=hf_split, + hf_subset=hf_subset, + regressor=self.regressor, + **kwargs, + ) + scores, test_cache = evaluator( + model, encode_kwargs=encode_kwargs, test_cache=test_cache + ) + scores_list.append(scores) + + avg_scores: dict[str, Any] = { + k: np.mean([s[k] for s in scores_list]) for k in scores_list[0] + } + avg_scores["scores_per_experiment"] = scores_list + return avg_scores + + def _add_main_score(self, scores): + scores["main_score"] = scores[self.metadata.main_score] + + def evaluate( + self, + model: Encoder, + split: str = "test", + subsets_to_run: list[HFSubset] | None = None, + *, + encode_kwargs: dict[str, Any] = {}, + **kwargs: Any, + ) -> dict[HFSubset, ScoresDict]: + if not self.data_loaded: + self.load_data() + + if "random_state" in self.model.get_params(): + self.model = self.model.set_params(random_state=self.seed) + + scores = {} + hf_subsets = self.hf_subsets + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + + for hf_subset in hf_subsets: + logger.info( + f"\nTask: {self.metadata.name}, split: {split}, subset: {hf_subset}. Running..." + ) + + if hf_subset not in self.dataset and hf_subset == "default": + ds = self.dataset + else: + ds = self.dataset[hf_subset] + scores[hf_subset] = self._evaluate_subset( + model, + ds, + hf_split=split, + hf_subset=hf_subset, + encode_kwargs=encode_kwargs, + **kwargs, + ) + self._add_main_score(scores[hf_subset]) + + return scores + + @staticmethod + def stratified_subsampling( + dataset_dict: datasets.DatasetDict, + seed: int, + splits: list[str] = ["test"], + label: str = "value", + n_samples: int = 2048, + n_bins: int = 10, + ) -> datasets.DatasetDict: + """Subsamples the dataset with stratification by the supplied label, which is assumed to be a continuous value. + The continuous values are bucketized into `n_bins` bins based on quantiles. + Returns a DatasetDict object. + + Args: + dataset_dict: the DatasetDict object. + seed: the random seed. + splits: the splits of the dataset. + label: the label with which the stratified sampling is based on. + n_samples: Optional, number of samples to subsample. + n_bins: Optional, number of bins to bucketize the continuous label. + """ + stratify_col_name = f"{label}_binned_for_stratification" + + for split in splits: + if n_samples >= len(dataset_dict[split]): + logger.debug( + "Subsampling not needed for split %s, as n_samples is equal or greater than the number of samples.", + split, + ) + continue + + dataset = dataset_dict[split] + labels = dataset[label] + + binned_labels = pd.qcut(labels, q=n_bins, labels=False, duplicates="drop") + dataset_with_bins: datasets.Dataset = dataset.add_column( + name=stratify_col_name, + column=binned_labels.tolist(), + ) + dataset_with_bins = dataset_with_bins.cast_column( + stratify_col_name, + datasets.ClassLabel(names=np.unique(binned_labels).tolist()), + ) + + subsampled_dataset = dataset_with_bins.train_test_split( + test_size=n_samples, seed=seed, stratify_by_column=stratify_col_name + )["test"] + + subsampled_dataset = subsampled_dataset.remove_columns([stratify_col_name]) + dataset_dict[split] = subsampled_dataset + + return dataset_dict + + def _calculate_metrics_from_split( + self, split: str, hf_subset: str | None = None, compute_overall: bool = False + ) -> RegressionDescriptiveStatistics: + train_text = [] + if hf_subset: + texts = self.dataset[hf_subset][split]["text"] + values = self.dataset[hf_subset][split]["value"] + if split != "train": + train_text = self.dataset[hf_subset]["train"]["text"] + elif compute_overall: + texts = [] + values = [] + for lang_subset in self.metadata.eval_langs: + texts.extend(self.dataset[lang_subset][split]["text"]) + values.extend(self.dataset[lang_subset][split]["value"]) + if split != "train": + train_text.extend(self.dataset[lang_subset]["train"]["text"]) + else: + texts = self.dataset[split]["text"] + values = self.dataset[split]["value"] + if split != "train": + train_text = self.dataset["train"]["text"] + + text_lengths = [len(t) for t in texts] + total_text_length = sum(text_lengths) + + num_texts_in_train_val = ( + len(set(texts) & set(train_text)) if split != "train" else None + ) + + return RegressionDescriptiveStatistics( + num_samples=len(texts), + number_of_characters=total_text_length, + num_texts_in_train=num_texts_in_train_val, + min_text_length=min(text_lengths) if text_lengths else 0, + average_text_length=(total_text_length / len(texts)) + if len(texts) > 0 + else 0, + max_text_length=max(text_lengths) if text_lengths else 0, + unique_text=len(set(texts)), + min_value=min(values) if values else 0.0, + average_value=(sum(values) / len(values)) if len(values) > 0 else 0.0, + max_value=max(values) if values else 0.0, + ) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 503b012d64..ad0d2bd6be 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -135,6 +135,7 @@ "machine-translated and verified", "machine-translated and localized", "LM-generated and verified", + "machine-translated and LM verified", "rendered", "multiple", ] @@ -169,15 +170,16 @@ ( "BitextMining", "Classification", - "MultilabelClassification", "Clustering", + "InstructionRetrieval", + "MultilabelClassification", "PairClassification", + "Regression", "Reranking", "Retrieval", + "Speed", "STS", "Summarization", - "InstructionRetrieval", - "Speed", ) + MIEB_TASK_TYPE + MAEB_TASK_TYPE diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index db391316d5..a3aa952558 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -13,6 +13,7 @@ from .AbsTaskSpeedTask import * from .AbsTaskSTS import * from .AbsTaskSummarization import * +from .AbsTaskTextRegression import * from .Audio.AbsTaskAudioClassification import * from .Audio.AbsTaskAudioClustering import * from .Audio.AbsTaskAudioPairClassification import * diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 78ec6e8765..980f4c4c9d 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -250,6 +250,45 @@ """, ) + +RU_SCI_BENCH = Benchmark( + name="RuSciBench", + tasks=get_tasks( + tasks=[ + # BitextMining + "RuSciBenchBitextMining", + # Classification + "RuSciBenchCoreRiscClassification", + "RuSciBenchGRNTIClassification.v2", + "RuSciBenchOECDClassification.v2", + "RuSciBenchPubTypeClassification", + # Retrieval + "RuSciBenchCiteRetrieval", + "RuSciBenchCociteRetrieval", + # Regression + "RuSciBenchCitedCountRegression", + "RuSciBenchYearPublRegression", + ], + ), + description="RuSciBench is a benchmark designed for evaluating sentence encoders and language models on scientific texts in both Russian and English. The data is sourced from eLibrary (www.elibrary.ru), Russia's largest electronic library of scientific publications. This benchmark facilitates the evaluation and comparison of models on various research-related tasks.", + reference="https://link.springer.com/article/10.1134/S1064562424602191", + citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", +) + MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="FollowIR", display_name="Instruction Following", @@ -2029,3 +2068,149 @@ } """, ) + + +VN_MTEB = Benchmark( + name="VN-MTEB (vie, v1)", + display_name="Vietnamese", + icon="https://raw.githubusercontent.com/lipis/flag-icons/refs/heads/main/flags/4x3/vn.svg", + tasks=get_tasks( + languages=["vie"], + exclusive_language_filter=True, + tasks=[ + # Retrieval + "ArguAna-VN", + "SciFact-VN", + "ClimateFEVER-VN", + "FEVER-VN", + "DBPedia-VN", + "NQ-VN", + "HotpotQA-VN", + "MSMARCO-VN", + "TRECCOVID-VN", + "FiQA2018-VN", + "NFCorpus-VN", + "SCIDOCS-VN", + "Touche2020-VN", + "Quora-VN", + "CQADupstackAndroid-VN", + "CQADupstackGis-VN", + "CQADupstackMathematica-VN", + "CQADupstackPhysics-VN", + "CQADupstackProgrammers-VN", + "CQADupstackStats-VN", + "CQADupstackTex-VN", + "CQADupstackUnix-VN", + "CQADupstackWebmasters-VN", + "CQADupstackWordpress-VN", + # Classification + "Banking77VNClassification", + "EmotionVNClassification", + "AmazonCounterfactualVNClassification", + "MTOPDomainVNClassification", + "TweetSentimentExtractionVNClassification", + "ToxicConversationsVNClassification", + "ImdbVNClassification", + "MTOPIntentVNClassification", + "MassiveScenarioVNClassification", + "MassiveIntentVNClassification", + "AmazonReviewsVNClassification", + "AmazonPolarityVNClassification", + # Pair Classification + "SprintDuplicateQuestions-VN", + "TwitterSemEval2015-VN", + "TwitterURLCorpus-VN", + # Clustering + "TwentyNewsgroupsClustering-VN", + "RedditClusteringP2P-VN", + "StackExchangeClusteringP2P-VN", + "StackExchangeClustering-VN", + "RedditClustering-VN", + # Reranking + "SciDocsRR-VN", + "AskUbuntuDupQuestions-VN", + "StackOverflowDupQuestions-VN", + # STS + "BIOSSES-VN", + "SICK-R-VN", + "STSBenchmark-VN", + ], + ), + description="A benchmark for text-embedding performance in Vietnamese.", + reference="https://arxiv.org/abs/2507.21500", + citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + contacts=["BaoLocPham"], +) + +JINA_VDR = Benchmark( + name="JinaVDR", + display_name="Jina Visual Document Retrieval", + tasks=get_tasks( + tasks=[ + "JinaVDRMedicalPrescriptionsRetrieval", + "JinaVDRStanfordSlideRetrieval", + "JinaVDRDonutVQAISynHMPRetrieval", + "JinaVDRTableVQARetrieval", + "JinaVDRChartQARetrieval", + "JinaVDRTQARetrieval", + "JinaVDROpenAINewsRetrieval", + "JinaVDREuropeanaDeNewsRetrieval", + "JinaVDREuropeanaEsNewsRetrieval", + "JinaVDREuropeanaItScansRetrieval", + "JinaVDREuropeanaNlLegalRetrieval", + "JinaVDRHindiGovVQARetrieval", + "JinaVDRAutomobileCatelogRetrieval", + "JinaVDRBeveragesCatalogueRetrieval", + "JinaVDRRamensBenchmarkRetrieval", + "JinaVDRJDocQARetrieval", + "JinaVDRHungarianDocQARetrieval", + "JinaVDRArabicChartQARetrieval", + "JinaVDRArabicInfographicsVQARetrieval", + "JinaVDROWIDChartsRetrieval", + "JinaVDRMPMQARetrieval", + "JinaVDRJina2024YearlyBookRetrieval", + "JinaVDRWikimediaCommonsMapsRetrieval", + "JinaVDRPlotQARetrieval", + "JinaVDRMMTabRetrieval", + "JinaVDRCharXivOCRRetrieval", + "JinaVDRStudentEnrollmentSyntheticRetrieval", + "JinaVDRGitHubReadmeRetrieval", + "JinaVDRTweetStockSyntheticsRetrieval", + "JinaVDRAirbnbSyntheticRetrieval", + "JinaVDRShanghaiMasterPlanRetrieval", + "JinaVDRWikimediaCommonsDocumentsRetrieval", + "JinaVDREuropeanaFrNewsRetrieval", + "JinaVDRDocQAHealthcareIndustryRetrieval", + "JinaVDRDocQAAI", + "JinaVDRShiftProjectRetrieval", + "JinaVDRTatQARetrieval", + "JinaVDRInfovqaRetrieval", + "JinaVDRDocVQARetrieval", + "JinaVDRDocQAGovReportRetrieval", + "JinaVDRTabFQuadRetrieval", + "JinaVDRDocQAEnergyRetrieval", + "JinaVDRArxivQARetrieval", + ], + ), + description="Multilingual, domain-diverse and layout-rich document retrieval benchmark.", + reference="https://arxiv.org/abs/2506.18902", + citation=r"""@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal, + archiveprefix = {arXiv}, + author = {Michael Günther and Saba Sturua and Mohammad Kalim Akram and Isabelle Mohr and Andrei Ungureanu and Bo Wang and Sedigheh Eslami and Scott Martens and Maximilian Werk and Nan Wang and Han Xiao}, + eprint = {2506.18902}, + primaryclass = {cs.AI}, + title = {jina-embeddings-v4: Universal Embeddings for Multimodal Multilingual Retrieval}, + url = {https://arxiv.org/abs/2506.18902}, + year = {2025}, +}""", +) diff --git a/mteb/custom_validators.py b/mteb/custom_validators.py index 6f11d8a27c..8d9be44950 100644 --- a/mteb/custom_validators.py +++ b/mteb/custom_validators.py @@ -22,6 +22,7 @@ "cc-by-4.0", "cc-by-sa-3.0", "cc-by-sa-4.0", + "cc-by-nc-3.0", "cc-by-nc-4.0", "cc-by-nc-sa-3.0", "cc-by-nc-sa-4.0", diff --git a/mteb/descriptive_stats/Retrieval/ChatDoctorRetrieval.json b/mteb/descriptive_stats/Retrieval/ChatDoctorRetrieval.json new file mode 100644 index 0000000000..cb2bd2c6ea --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/ChatDoctorRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 2389149, + "num_samples": 11136, + "num_queries": 5591, + "num_documents": 5545, + "min_document_length": 12, + "average_document_length": 428.8654643823264, + "max_document_length": 4846, + "unique_documents": 5545, + "min_query_length": 2, + "average_query_length": 1.9835449830084064, + "max_query_length": 2, + "unique_queries": 5591, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 5545 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/DS1000Retrieval.json b/mteb/descriptive_stats/Retrieval/DS1000Retrieval.json new file mode 100644 index 0000000000..42970b453f --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/DS1000Retrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 88032, + "num_samples": 2000, + "num_queries": 1000, + "num_documents": 1000, + "min_document_length": 47, + "average_document_length": 86.032, + "max_document_length": 235, + "unique_documents": 1000, + "min_query_length": 2, + "average_query_length": 2.0, + "max_query_length": 2, + "unique_queries": 1000, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1000 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/FinQARetrieval.json b/mteb/descriptive_stats/Retrieval/FinQARetrieval.json new file mode 100644 index 0000000000..75a43c281b --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/FinQARetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 165000, + "num_samples": 1800, + "num_queries": 900, + "num_documents": 900, + "min_document_length": 75, + "average_document_length": 183.33, + "max_document_length": 600, + "unique_documents": 900, + "min_query_length": 3, + "average_query_length": 3.0, + "max_query_length": 3, + "unique_queries": 900, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 900 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/FinanceBenchRetrieval.json b/mteb/descriptive_stats/Retrieval/FinanceBenchRetrieval.json new file mode 100644 index 0000000000..811655724d --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/FinanceBenchRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 180000, + "num_samples": 1500, + "num_queries": 750, + "num_documents": 750, + "min_document_length": 80, + "average_document_length": 240.0, + "max_document_length": 800, + "unique_documents": 750, + "min_query_length": 4, + "average_query_length": 4.0, + "max_query_length": 4, + "unique_queries": 750, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 750 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/FreshStackRetrieval.json b/mteb/descriptive_stats/Retrieval/FreshStackRetrieval.json new file mode 100644 index 0000000000..376363ee77 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/FreshStackRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 125000, + "num_samples": 3000, + "num_queries": 1500, + "num_documents": 1500, + "min_document_length": 30, + "average_document_length": 83.33, + "max_document_length": 300, + "unique_documents": 1500, + "min_query_length": 2, + "average_query_length": 2.0, + "max_query_length": 2, + "unique_queries": 1500, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1500 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/HC3FinanceRetrieval.json b/mteb/descriptive_stats/Retrieval/HC3FinanceRetrieval.json new file mode 100644 index 0000000000..3e0e2056bc --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/HC3FinanceRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 200000, + "num_samples": 2400, + "num_queries": 1200, + "num_documents": 1200, + "min_document_length": 50, + "average_document_length": 166.67, + "max_document_length": 500, + "unique_documents": 1200, + "min_query_length": 3, + "average_query_length": 3.0, + "max_query_length": 3, + "unique_queries": 1200, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1200 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/HumanEvalRetrieval.json b/mteb/descriptive_stats/Retrieval/HumanEvalRetrieval.json new file mode 100644 index 0000000000..bed480da1b --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/HumanEvalRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 46334, + "num_samples": 316, + "num_queries": 158, + "num_documents": 158, + "min_document_length": 23, + "average_document_length": 291.253164556962, + "max_document_length": 1200, + "unique_documents": 158, + "min_query_length": 2, + "average_query_length": 2.0, + "max_query_length": 2, + "unique_queries": 158, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 158 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MBPPRetrieval.json b/mteb/descriptive_stats/Retrieval/MBPPRetrieval.json new file mode 100644 index 0000000000..8f01b9e292 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MBPPRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 78520, + "num_samples": 1948, + "num_queries": 974, + "num_documents": 974, + "min_document_length": 37, + "average_document_length": 78.61601642710473, + "max_document_length": 249, + "unique_documents": 974, + "min_query_length": 2, + "average_query_length": 2.0, + "max_query_length": 2, + "unique_queries": 974, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 974 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/WikiSQLRetrieval.json b/mteb/descriptive_stats/Retrieval/WikiSQLRetrieval.json new file mode 100644 index 0000000000..f6d3dcf1cf --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/WikiSQLRetrieval.json @@ -0,0 +1,20 @@ +{ + "test": { + "number_of_characters": 3293525, + "num_samples": 4096, + "num_queries": 2048, + "num_documents": 2048, + "min_document_length": 540, + "average_document_length": 1606.16650390625, + "max_document_length": 5833, + "unique_documents": 2048, + "min_query_length": 2, + "average_query_length": 2.0, + "max_query_length": 2, + "unique_queries": 2048, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 2048 + } +} \ No newline at end of file diff --git a/mteb/encoder_interface.py b/mteb/encoder_interface.py index b2ea85eee9..3df5dd34bf 100644 --- a/mteb/encoder_interface.py +++ b/mteb/encoder_interface.py @@ -17,7 +17,7 @@ class PromptType(str, Enum): query = "query" - passage = "passage" + document = "document" @runtime_checkable diff --git a/mteb/evaluation/evaluators/Audio/AudioRerankingEvaluator.py b/mteb/evaluation/evaluators/Audio/AudioRerankingEvaluator.py index 13dd415063..50438d6971 100644 --- a/mteb/evaluation/evaluators/Audio/AudioRerankingEvaluator.py +++ b/mteb/evaluation/evaluators/Audio/AudioRerankingEvaluator.py @@ -142,7 +142,7 @@ def compute_metrics_batched(self, model: AudioEncoder): model.get_audio_embeddings( docs_dataloader, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) ) @@ -230,7 +230,7 @@ def compute_metrics_individual(self, model: AudioEncoder): model.get_audio_embeddings( docs_dataloader, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) ) diff --git a/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py index 3572791171..aa714a6815 100644 --- a/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py @@ -214,7 +214,7 @@ def search( sub_corpus_embeddings = self.model.get_text_embeddings( texts=corpus_texts, task_name=task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) elif corpus_modality == "audio": @@ -229,7 +229,7 @@ def search( sub_corpus_embeddings = self.model.get_audio_embeddings( audio=corpus_audio_dataloader, task_name=task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) elif "image" in corpus_modality: @@ -247,7 +247,7 @@ def search( sub_corpus_embeddings = self.model.get_image_embeddings( images=corpus_image_dataloader, task_name=task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) elif corpus_modality == "image,text": @@ -256,7 +256,7 @@ def search( texts=corpus_texts, images=corpus_image_dataloader, task_name=task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) else: diff --git a/mteb/evaluation/evaluators/RegressionEvaluator.py b/mteb/evaluation/evaluators/RegressionEvaluator.py new file mode 100644 index 0000000000..7242d1559e --- /dev/null +++ b/mteb/evaluation/evaluators/RegressionEvaluator.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import logging +from typing import Any, Protocol + +import numpy as np +from scipy.stats import kendalltau +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score + +from mteb.encoder_interface import Encoder + +from .Evaluator import Evaluator + +logger = logging.getLogger(__name__) + + +class SklearnRegressorModel(Protocol): + def fit(self, X, y, sample_weight=None): ... + def predict(self, X): ... + + +class LinearRegressionEvaluator(Evaluator): + def __init__( + self, + sentences_train, + y_train, + sentences_test, + y_test, + task_name: str, + hf_split: str, + hf_subset: str, + regressor: SklearnRegressorModel, + **kwargs, + ): + super().__init__(**kwargs) + self.sentences_train = sentences_train + self.y_train = y_train + self.sentences_test = sentences_test + self.y_test = y_test + + self.hf_split = hf_split + self.hf_subset = hf_subset + self.task_name = task_name + self.regressor = regressor + + def __call__( + self, + model: Encoder, + *, + encode_kwargs: dict[str, Any] = {}, + test_cache: np.ndarray | None = None, + ) -> tuple[dict[str, float], np.ndarray]: + scores = {} + X_train = model.encode( + self.sentences_train, + model=model, + task_name=self.task_name, + hf_split="train", + hf_subset=self.hf_subset, + **encode_kwargs, + ) + if test_cache is None: + X_test = model.encode( + self.sentences_test, + model=model, + task_name=self.task_name, + hf_split=self.hf_split, + hf_subset=self.hf_subset, + **encode_kwargs, + ) + test_cache = X_test + + self.regressor.fit(X_train, self.y_train) + y_pred = self.regressor.predict(test_cache) + + scores["mae"] = mean_absolute_error(self.y_test, y_pred) + scores["mse"] = mean_squared_error(self.y_test, y_pred) + scores["rmse"] = np.sqrt(scores["mse"]) + scores["r2"] = r2_score(self.y_test, y_pred) + scores["kendalltau"] = kendalltau(self.y_test, y_pred).statistic + + return scores, test_cache diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 3c45126bdf..259a57b782 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -180,7 +180,7 @@ def _encode_candidates_batched( all_docs, model, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) @@ -245,7 +245,7 @@ def _encode_candidates_individual( model.encode( docs, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) ) @@ -293,7 +293,7 @@ def _encode_candidates_miracl_batched(self, all_query_embs, model: Encoder): model.encode( all_docs, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) ) @@ -345,7 +345,7 @@ def _encode_candidates_miracl_individual(self, model: Encoder): model.encode( docs, task_name=self.task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, **self.encode_kwargs, ) ) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 69034b1741..91e6a8afd5 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -159,7 +159,7 @@ def search( sub_corpus_embeddings = self.model.encode( corpus[corpus_start_idx:corpus_end_idx], # type: ignore task_name=task_name, - prompt_type=PromptType.passage, + prompt_type=PromptType.document, request_qid=request_qid, **self.encode_kwargs, ) @@ -385,7 +385,7 @@ def encode_corpus( corpus: list[dict[str, str]], task_name: str, batch_size: int, - prompt_type: PromptType = PromptType.passage, + prompt_type: PromptType = PromptType.document, request_qid: str | None = None, **kwargs, ): @@ -416,7 +416,7 @@ def encode( prompt_type: PromptType | None = None, **kwargs, ): - if prompt_type and prompt_type == PromptType.passage: + if prompt_type and prompt_type == PromptType.document: return self.encode_corpus( sentences, task_name, prompt_type=prompt_type, **kwargs ) @@ -528,6 +528,7 @@ def evaluate( ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) recall_string = "recall." + ",".join([str(k) for k in k_values]) precision_string = "P." + ",".join([str(k) for k in k_values]) + evaluator = pytrec_eval.RelevanceEvaluator( qrels, {map_string, ndcg_string, recall_string, precision_string} ) diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index 5161b6a465..cc4aa20b0f 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -15,6 +15,7 @@ from .Image.VisualSTSEvaluator import * from .Image.ZeroShotClassificationEvaluator import * from .PairClassificationEvaluator import * +from .RegressionEvaluator import * from .RerankingEvaluator import * from .RetrievalEvaluator import * from .STSEvaluator import * diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 8903301022..2dcb4d96be 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -16,10 +16,10 @@ import mteb from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE -from mteb.benchmarks.benchmarks import MTEB_multilingual from mteb.custom_validators import MODALITIES from mteb.leaderboard.benchmark_selector import ( BENCHMARK_ENTRIES, + DEFAULT_BENCHMARK_NAME, make_selector, ) from mteb.leaderboard.figures import performance_size_plot, radar_chart @@ -59,9 +59,6 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: return md -DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name - - def set_benchmark_on_load(request: gr.Request): query_params = request.query_params return query_params.get("benchmark_name", DEFAULT_BENCHMARK_NAME) diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index fc735ca331..dff2c8edc9 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,10 +3,13 @@ from dataclasses import dataclass import gradio as gr +from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual import mteb from mteb import Benchmark +DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name + @dataclass class MenuEntry: @@ -32,6 +35,7 @@ class MenuEntry: "MIEB(lite)", "MIEB(Img)", "VisualDocumentRetrieval", + "JinaVDR", ] ), ), @@ -61,6 +65,7 @@ class MenuEntry: "MTEB(pol, v1)", "MTEB(rus, v1)", "MTEB(fas, v1)", + "VN-MTEB (vie, v1)", ] ) + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], @@ -136,7 +141,7 @@ def make_selector(entries: list[MenuEntry]) -> tuple[gr.State, gr.Column]: button_counter = 0 with gr.Column() as column: - state = gr.State("selector_state") + state = gr.State(DEFAULT_BENCHMARK_NAME) for category_entry in entries: button_counter = _render_category( @@ -188,5 +193,4 @@ def _render_benchmark_item( if __name__ == "__main__": with gr.Blocks() as b: selector = make_selector(BENCHMARK_ENTRIES) - b.launch() diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 3fce4cfc39..ab37a6dd62 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -783,6 +783,31 @@ adapted_from="intfloat/e5-mistral-7b-instruct", ) +bge_m3_unsupervised = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-m3-unsupervised", + revision="46f03bc86361cf88102b0b517b36c8259f2946b1", + ), + name="BAAI/bge-m3-unsupervised", + languages=bgem3_languages, + open_weights=True, + revision="46f03bc86361cf88102b0b517b36c8259f2946b1", + release_date="2024-01-30", # January 30, 2024 - BGE-M3 release date + n_parameters=568_000_000, + memory_usage_mb=2167, + embed_dim=1024, + license="mit", + max_tokens=8192, + reference="https://huggingface.co/BAAI/bge-m3-unsupervised", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code="https://github.com/FlagOpen/FlagEmbedding", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_m3_training_data, +) + manu__bge_m3_custom_fr = ModelMeta( name="manu/bge-m3-custom-fr", revision="ed3ef88678ba83ddf4c0fab71a93cb90d89a9078", diff --git a/mteb/models/cadet_models.py b/mteb/models/cadet_models.py index c2717128cb..81e0dffdb0 100644 --- a/mteb/models/cadet_models.py +++ b/mteb/models/cadet_models.py @@ -35,7 +35,7 @@ revision="8056d118be37a566f20972a5f35cda815f6bc47e", model_prompts={ "query": "query: ", - "passage": "passage: ", + "document": "passage: ", }, ), name="manveertamber/cadet-embed-base-v1", diff --git a/mteb/models/codi_models.py b/mteb/models/codi_models.py new file mode 100644 index 0000000000..10c61ff0db --- /dev/null +++ b/mteb/models/codi_models.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +import logging +from functools import partial + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper + +logger = logging.getLogger(__name__) + +codi_instruction = { + "CmedqaRetrieval": { + "query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "passage": "", + }, + "CovidRetrieval": { + "query": "Given a question on COVID-19, retrieve news articles that answer the question", + "passage": "", + }, + "DuRetrieval": { + "query": "Given a Chinese search query, retrieve web passages that answer the question", + "passage": "", + }, + "EcomRetrieval": { + "query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products", + "passage": "", + }, + "MedicalRetrieval": { + "query": "Given a medical question, retrieve user replies that best answer the question", + "passage": "", + }, + "MMarcoRetrieval": { + "query": "Given a web search query, retrieve relevant passages that answer the query", + "passage": "", + }, + "T2Retrieval": { + "query": "Given a Chinese search query, retrieve web passages that answer the question", + "passage": "", + }, + "VideoRetrieval": { + "query": "Given a video search query, retrieve the titles of relevant videos", + "passage": "", + }, + "AFQMC": "Represent the text in conversations between users and financial customer service, retrieve semantically similar text", + "ATEC": "Represent the text in conversations between users and financial customer service, retrieve semantically similar text", + "BQ": "Represent the user problem descriptions when handling bank credit business, retrieve semantically similar text", + "LCQMC": "Represent the user question descriptions on general question-answering platforms, retrieve semantically similar text", + "PAWSX": "Represent the Chinese Translations of English Encyclopedias, retrieve semantically similar text", + "QBQTC": "Represent the web search query, retrieve semantically similar text", + "STSB": "Represent the short general domain sentences, retrieve semantically similar text", + "T2Reranking": { + "query": "Given a Chinese search query, retrieve web passages that answer the question", + "passage": "", + }, + "MMarcoReranking": { + "query": "Given a web search query, retrieve relevant passages that answer the query", + "passage": "", + }, + "CMedQAv1-reranking": { + "query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "passage": "", + }, + "CMedQAv2-reranking": { + "query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "passage": "", + }, + "Ocnli": "Retrieve semantically similar text", + "Cmnli": "Retrieve semantically similar text", + "TNews": "Classify the fine-grained category of the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "OnlineShopping": "Classify the customer review for online shopping into positive or negative", + "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative", + "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", +} + + +def instruction_template( + instruction: str, prompt_type: PromptType | None = None +) -> str: + if not instruction or prompt_type == PromptType.document: + return "" + if isinstance(instruction, dict): + if prompt_type is None: + instruction = list(instruction.values())[0] + else: + instruction = instruction[prompt_type] + return f"Instruction: {instruction} \nQuery: " + + +training_data = { + "T2Retrieval": ["train"], + "DuRetrieval": ["train"], + "T2Reranking": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "BQ": ["train"], + "LCQMC": ["train"], + "PAWSX": ["train"], + "STS-B": ["train"], + "AFQMC": ["train"], + "Cmnli": ["train"], + "Ocnli": ["train"], +} + +model_name_or_path = "Youtu-RAG/CoDi-Embedding-V1" + +CoDiEmb_Embedding_V1 = ModelMeta( + name="Youtu-RAG/CoDi-Embedding-V1", + languages=["zho-Hans"], + revision="9ee4337715ce337f12b8d30f20e87e8528ccedd6", + release_date="2025-08-20", + loader=partial( + InstructSentenceTransformerWrapper, + model_name_or_path, + revision="9ee4337715ce337f12b8d30f20e87e8528ccedd6", + instruction_template=instruction_template, + apply_instruction_to_passages=True, + prompts_dict=codi_instruction, + trust_remote_code=True, + max_seq_length=4096, + ), + open_weights=True, + n_parameters=2724880896, + memory_usage_mb=None, + embed_dim=2304, + license="apache-2.0", + max_tokens=4096, + reference="https://huggingface.co/Youtu-RAG/CoDi-Embedding-V1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=training_data, +) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 2a40f47218..606195417a 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -211,7 +211,7 @@ def encode( "MultilabelClassification": "classification", "Clustering": "clustering", PromptType.query.value: "search_query", - PromptType.passage.value: "search_document", + PromptType.document.value: "search_document", } cohere_mult_3 = ModelMeta( diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index b52a31fec8..22aa2c8d36 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -16,6 +16,118 @@ from mteb.model_meta import ModelMeta from mteb.requires_package import requires_image_dependencies, requires_package +all_languages = [ + "afr-Latn", + "amh-Ethi", + "ara-Arab", + "asm-Beng", + "aze-Latn", + "bel-Cyrl", + "bul-Cyrl", + "ben-Beng", + "bod-Tibt", + "bos-Latn", + "cat-Latn", + "ceb-Latn", + "cos-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "epo-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "fry-Latn", + "gle-Latn", + "gla-Latn", + "glg-Latn", + "guj-Gujr", + "hau-Latn", + "haw-Latn", + "heb-Hebr", + "hin-Deva", + "hmn-Latn", + "hrv-Latn", + "hat-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "ibo-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Jpan", + "jav-Latn", + "kat-Geor", + "kaz-Cyrl", + "khm-Khmr", + "kan-Knda", + "kor-Kore", + "kur-Arab", + "kir-Cyrl", + "lat-Latn", + "ltz-Latn", + "lao-Laoo", + "lit-Latn", + "lav-Latn", + "mlg-Latn", + "mri-Latn", + "mkd-Cyrl", + "mal-Mlym", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mlt-Latn", + "mya-Mymr", + "nep-Deva", + "nld-Latn", + "nor-Latn", + "nya-Latn", + "ori-Orya", + "pan-Guru", + "pol-Latn", + "por-Latn", + "ron-Latn", + "rus-Cyrl", + "kin-Latn", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "smo-Latn", + "sna-Latn", + "som-Latn", + "sqi-Latn", + "srp-Cyrl", + "sot-Latn", + "sun-Latn", + "swe-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tgk-Cyrl", + "tha-Thai", + "tuk-Latn", + "tgl-Latn", + "tur-Latn", + "tat-Cyrl", + "uig-Arab", + "ukr-Cyrl", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "wol-Latn", + "xho-Latn", + "yid-Hebr", + "yor-Latn", + "zho-Hans", + "zul-Latn", +] + def cohere_v_loader(**kwargs): model_name = kwargs.get("model_name", "Cohere") @@ -226,3 +338,25 @@ def get_fused_embeddings( use_instructions=False, training_datasets=None, ) + +cohere_embed_v4_multimodal = ModelMeta( + loader=partial(cohere_v_loader, model_name="embed-v4.0"), + name="Cohere/Cohere-embed-v4.0", + languages=all_languages, + revision="1", + release_date="2024-12-01", + n_parameters=None, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=1536, + license=None, + similarity_fn_name="cosine", + framework=[], + modalities=["image", "text"], + open_weights=False, + public_training_code=None, + public_training_data=None, + reference="https://docs.cohere.com/docs/cohere-embed", + use_instructions=False, + training_datasets=None, +) diff --git a/mteb/models/colpali_models.py b/mteb/models/colpali_models.py index 04f0c21755..35396b92d6 100644 --- a/mteb/models/colpali_models.py +++ b/mteb/models/colpali_models.py @@ -7,6 +7,7 @@ import torch from PIL import Image from torch.utils.data import DataLoader +from tqdm import tqdm from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta @@ -39,7 +40,10 @@ def __init__( # Load model self.mdl = model_class.from_pretrained( - model_name, revision=revision, device_map=self.device, **kwargs + model_name, + device_map=self.device, + adapter_kwargs={"revision": revision}, + **kwargs, ) self.mdl.eval() @@ -68,14 +72,13 @@ def get_image_embeddings( iterator = DataLoader(images, batch_size=batch_size) with torch.no_grad(): - for batch in iterator: + for batch in tqdm(iterator): # batch may be list of tensors or PIL imgs = [ F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b for b in batch ] - inputs = self.processor.process_images(imgs) - inputs = {k: v.to(self.device) for k, v in inputs.items()} + inputs = self.processor.process_images(imgs).to(self.device) outs = self.encode_input(inputs) all_embeds.extend(outs.cpu().to(torch.float32)) @@ -92,10 +95,14 @@ def get_text_embeddings( ): all_embeds = [] with torch.no_grad(): - for i in range(0, len(texts), batch_size): - batch = texts[i : i + batch_size] - inputs = self.processor.process_queries(batch) - inputs = {k: v.to(self.device) for k, v in inputs.items()} + for i in tqdm(range(0, len(texts), batch_size)): + batch = [ + self.processor.query_prefix + + t + + self.processor.query_augmentation_token * 10 + for t in texts[i : i + batch_size] + ] + inputs = self.processor.process_texts(batch).to(self.device) outs = self.encode_input(inputs) all_embeds.extend(outs.cpu().to(torch.float32)) @@ -120,11 +127,11 @@ def get_fused_embeddings( ) def calculate_probs(self, text_embeddings, image_embeddings): - scores = self.similarity(text_embeddings, image_embeddings) - return (scores * 100).softmax(dim=-1) + scores = self.similarity(text_embeddings, image_embeddings).T + return scores.softmax(dim=-1) def similarity(self, a, b): - return self.processor.score_multi_vector(a, b) + return self.processor.score(a, b, **self.processor_kwargs) class ColPaliWrapper(ColPaliEngineWrapper): @@ -164,6 +171,7 @@ def __init__( loader=partial( ColPaliWrapper, model_name="vidore/colpali-v1.1", + revision="a0f15e3bcf97110e7ac1bb4be4bcd30eeb31992a", torch_dtype=torch.float16, ), name="vidore/colpali-v1.1", @@ -190,6 +198,7 @@ def __init__( loader=partial( ColPaliWrapper, model_name="vidore/colpali-v1.2", + revision="6b89bc63c16809af4d111bfe412e2ac6bc3c9451", torch_dtype=torch.float16, ), name="vidore/colpali-v1.2", @@ -216,6 +225,7 @@ def __init__( loader=partial( ColPaliWrapper, model_name="vidore/colpali-v1.3", + revision="1b5c8929330df1a66de441a9b5409a878f0de5b0", torch_dtype=torch.float16, ), name="vidore/colpali-v1.3", diff --git a/mteb/models/colqwen_models.py b/mteb/models/colqwen_models.py index 88724e2c73..c0c194d4f8 100644 --- a/mteb/models/colqwen_models.py +++ b/mteb/models/colqwen_models.py @@ -69,6 +69,7 @@ def __init__( loader=partial( ColQwen2Wrapper, model_name="vidore/colqwen2-v1.0", + revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() @@ -98,6 +99,7 @@ def __init__( loader=partial( ColQwen2_5Wrapper, model_name="vidore/colqwen2.5-v0.2", + revision="6f6fcdfd1a114dfe365f529701b33d66b9349014", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() @@ -105,7 +107,7 @@ def __init__( ), name="vidore/colqwen2.5-v0.2", languages=["eng-Latn"], - revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", + revision="6f6fcdfd1a114dfe365f529701b33d66b9349014", release_date="2025-01-31", modalities=["image", "text"], n_parameters=3_000_000_000, @@ -123,35 +125,6 @@ def __init__( training_datasets=COLPALI_TRAINING_DATA, ) -colnomic_7b = ModelMeta( - loader=partial( - ColQwen2_5Wrapper, - model_name="nomic-ai/colnomic-embed-multimodal-7b", - torch_dtype=torch.float16, - attn_implementation="flash_attention_2" - if is_flash_attn_2_available() - else None, - ), - name="nomic-ai/colnomic-embed-multimodal-7b", - languages=["eng-Latn"], - revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", - release_date="2025-03-31", - modalities=["image", "text"], - n_parameters=7_000_000_000, - memory_usage_mb=14400, - max_tokens=128000, - embed_dim=128, - license="apache-2.0", - open_weights=True, - public_training_code="https://github.com/nomic-ai/colpali", - public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set", - framework=["ColPali"], - reference="https://huggingface.co/nomic-ai/colnomic-embed-multimodal-7b", - similarity_fn_name="max_sim", - use_instructions=True, - training_datasets=COLPALI_TRAINING_DATA, -) - COLNOMIC_TRAINING_DATA = {"VDRMultilingual": ["Train"], **COLPALI_TRAINING_DATA} COLNOMIC_LANGUAGES = [ "deu-Latn", # German @@ -165,6 +138,7 @@ def __init__( loader=partial( ColQwen2_5Wrapper, model_name="nomic-ai/colnomic-embed-multimodal-3b", + revision="86627b4a9b0cade577851a70afa469084f9863a4", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() @@ -172,7 +146,7 @@ def __init__( ), name="nomic-ai/colnomic-embed-multimodal-3b", languages=COLNOMIC_LANGUAGES, - revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", + revision="86627b4a9b0cade577851a70afa469084f9863a4", release_date="2025-03-31", modalities=["image", "text"], n_parameters=3_000_000_000, @@ -194,6 +168,7 @@ def __init__( loader=partial( ColQwen2_5Wrapper, model_name="nomic-ai/colnomic-embed-multimodal-7b", + revision="09dbc9502b66605d5be56d2226019b49c9fd3293", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() diff --git a/mteb/models/colsmol_models.py b/mteb/models/colsmol_models.py index c02fe365d5..a62b1db2ae 100644 --- a/mteb/models/colsmol_models.py +++ b/mteb/models/colsmol_models.py @@ -44,6 +44,7 @@ def __init__( loader=partial( ColSmolWrapper, model_name="vidore/colSmol-256M", + revision="a59110fdf114638b8018e6c9a018907e12f14855", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() @@ -51,7 +52,7 @@ def __init__( ), name="vidore/colSmol-256M", languages=["eng-Latn"], - revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", + revision="a59110fdf114638b8018e6c9a018907e12f14855", release_date="2025-01-22", modalities=["image", "text"], n_parameters=256_000_000, @@ -73,6 +74,7 @@ def __init__( loader=partial( ColSmolWrapper, model_name="vidore/colSmol-500M", + revision="1aa9325cba7ed2b3b9b97ede4d55026322504902", torch_dtype=torch.float16, attn_implementation="flash_attention_2" if is_flash_attn_2_available() @@ -80,7 +82,7 @@ def __init__( ), name="vidore/colSmol-500M", languages=["eng-Latn"], - revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", + revision="1aa9325cba7ed2b3b9b97ede4d55026322504902", release_date="2025-01-22", modalities=["image", "text"], n_parameters=500_000_000, diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 395d1da311..6a62f6e8fd 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -110,7 +110,7 @@ model_prompts = { PromptType.query.value: "query: ", - PromptType.passage.value: "passage: ", + PromptType.document.value: "passage: ", } E5_TRAINING_DATA = { diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py index 29e7de8c78..f3a2277ab8 100644 --- a/mteb/models/gme_v_models.py +++ b/mteb/models/gme_v_models.py @@ -182,7 +182,7 @@ def encode_corpus(self, corpus: list[dict[str, str]], **kwargs): else doc["text"].strip() for doc in corpus ] - embeddings = self.encode(sentences, prompt_type=PromptType.passage**kwargs) + embeddings = self.encode(sentences, prompt_type=PromptType.document**kwargs) return embeddings def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs): @@ -210,7 +210,7 @@ def get_fused_embeddings( instruction=None, **kwargs: Any, ): - if prompt_type == PromptType.passage: + if prompt_type == PromptType.document: instruction = None elif instruction is None: instruction = self.get_instruction(task_name, prompt_type) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 1ad9aa0a61..2ef93b261b 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -39,7 +39,7 @@ "Clustering": "CLUSTERING", "STS": "SEMANTIC_SIMILARITY", PromptType.query.value: "RETRIEVAL_QUERY", - PromptType.passage.value: "RETRIEVAL_DOCUMENT", + PromptType.document.value: "RETRIEVAL_DOCUMENT", } GECKO_TRAINING_DATA = { @@ -231,7 +231,7 @@ def encode( max_tokens=2048, embed_dim=3072, license=None, - reference="https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings", + reference="https://ai.google.dev/gemini-api/docs/embeddings", similarity_fn_name="cosine", framework=["API"], use_instructions=True, diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py new file mode 100644 index 0000000000..ef8c668027 --- /dev/null +++ b/mteb/models/granite_vision_embedding_models.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import logging +from functools import partial +from typing import Any + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from transformers import AutoModel, AutoProcessor +from transformers.utils.import_utils import is_flash_attn_2_available + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.requires_package import ( + requires_image_dependencies, +) + +logger = logging.getLogger(__name__) + + +class GraniteVisionEmbeddingWrapper: + def __init__( + self, + model_name: str, + revision: str | None = None, + device: str | None = None, + **kwargs, + ): + requires_image_dependencies() + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.model_name = model_name + + # Load model + self.mdl = AutoModel.from_pretrained( + model_name, + revision=revision, + device_map=self.device, + trust_remote_code=True, + **kwargs, + ) + + self.mdl.eval() + + # Load processor + self.processor = AutoProcessor.from_pretrained( + model_name, trust_remote_code=True, revision=revision + ) + + def encode(self, sentences, **kwargs): + return self.get_text_embeddings(texts=sentences, **kwargs) + + def encode_input(self, inputs): + return self.mdl(**inputs) + + def get_image_embeddings( + self, + images, + batch_size: int = 16, + **kwargs, + ): + import torchvision.transforms.functional as F + + all_embeds = [] + + if isinstance(images, DataLoader): + iterator = images + else: + iterator = DataLoader(images, batch_size=batch_size) + + with torch.no_grad(): + for batch in iterator: + # batch may be list of tensors or PIL + imgs = [ + F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b + for b in batch + ] + inputs = self.processor.process_images(imgs) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_text_embeddings( + self, + texts, + batch_size: int = 32, + **kwargs, + ): + all_embeds = [] + with torch.no_grad(): + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + inputs = self.processor.process_queries(batch) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_fused_embeddings( + self, + texts: list[str] | None = None, + images: list[Image.Image] | DataLoader | None = None, + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + fusion_mode="sum", + **kwargs: Any, + ): + raise NotImplementedError( + "Fused embeddings are not supported yet. Please use get_text_embeddings or get_image_embeddings." + ) + + def calculate_probs(self, text_embeddings, image_embeddings): + scores = self.similarity(text_embeddings, image_embeddings) + return (scores * 100).softmax(dim=-1) + + def similarity(self, a, b): + return self.processor.score_multi_vector(a, b) + + +granite_vision_embedding = ModelMeta( + loader=partial( + GraniteVisionEmbeddingWrapper, + model_name="ibm-granite/granite-vision-3.3-2b-embedding", + torch_dtype=torch.float16, + attn_implementation="flash_attention_2" + if is_flash_attn_2_available() + else None, + revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca", + ), + name="ibm-granite/granite-vision-3.3-2b-embedding", + languages=["eng-Latn"], + revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca", + release_date="2025-06-11", + modalities=["image", "text"], + n_parameters=2_980_000_000, + memory_usage_mb=11351, + max_tokens=128000, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding", + similarity_fn_name="max_sim", + use_instructions=True, + training_datasets=None, # proprietary, not public +) diff --git a/mteb/models/hit_tmg_models.py b/mteb/models/hit_tmg_models.py index 15540416af..e29e602694 100644 --- a/mteb/models/hit_tmg_models.py +++ b/mteb/models/hit_tmg_models.py @@ -38,10 +38,13 @@ def encode( task = get_task(task_name) # to passage prompts won't be applied to passages - if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + if ( + not self.apply_instruction_to_passages + and prompt_type == PromptType.document + ): instruction = None logger.info( - f"No instruction used, because prompt type = {prompt_type.passage}" + f"No instruction used, because prompt type = {PromptType.document}" ) if task.metadata.type in ["STS", "PairClassification", "Summarization"]: diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index db5a883f2c..5e07fc1f62 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -191,3 +191,58 @@ use_instructions=False, training_datasets=granite_training_data, ) + + +granite_english_r2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-english-r2", + revision="6e7b8ce0e76270394ac4669ba4bbd7133b60b7f9", + ), + name="ibm-granite/granite-embedding-english-r2", + languages=["eng-Latn"], + open_weights=True, + revision="6e7b8ce0e76270394ac4669ba4bbd7133b60b7f9", + release_date="2025-08-15", + n_parameters=149_000_000, + memory_usage_mb=284, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/ibm-granite/granite-embedding-english-r2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, +) + +granite_small_english_r2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="ibm-granite/granite-embedding-small-english-r2", + revision="54a8d2616a0844355a5164432d3f6dafb37b17a3", + ), + name="ibm-granite/granite-embedding-small-english-r2", + languages=["eng-Latn"], + open_weights=True, + revision="54a8d2616a0844355a5164432d3f6dafb37b17a3", + release_date="2025-08-15", + n_parameters=47_000_000, + memory_usage_mb=91, + embed_dim=384, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/ibm-granite/granite-embedding-small-english-r2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from=None, + superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, +) diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 1cd7ae9db6..c5be2a672d 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -147,10 +147,13 @@ def encode( ) # to passage prompts won't be applied to passages - if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + if ( + not self.apply_instruction_to_passages + and prompt_type == PromptType.document + ): instruction = None logger.info( - f"No instruction used, because prompt type = {prompt_type.passage}" + f"No instruction used, because prompt type = {prompt_type.document}" ) if instruction: diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index b3a3f62516..cfdc3849df 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -44,7 +44,7 @@ def encode( instruction = self.get_task_instruction(task_name, prompt_type) # to passage prompts won't be applied to passages - if prompt_type == PromptType.passage and task.metadata.category == "s2p": + if prompt_type == PromptType.document and task.metadata.category == "s2p": instruction = None embeddings = self.model.encode( diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index dea802ab03..d63522f1af 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -7,12 +7,15 @@ import numpy as np import torch +from PIL import Image from sentence_transformers import __version__ as st_version +from transformers import AutoModel from mteb.encoder_interface import PromptType from mteb.languages import PROGRAMMING_LANGS from mteb.model_meta import ModelMeta from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper +from mteb.models.wrapper import Wrapper from mteb.requires_package import requires_package logger = logging.getLogger(__name__) @@ -213,24 +216,27 @@ def encode( return embeddings -class JinaV4Wrapper(SentenceTransformerWrapper): +class JinaV4Wrapper(Wrapper): """following the hf model card documentation.""" - jina_task_to_prompt = { - "retrieval.query": "Query: ", - "retrieval.passage": "Passage: ", - "text-matching": "Query: ", - } + SUPPORTED_VECTOR_TYPES = {"single_vector", "multi_vector"} def __init__( self, model: str, revision: str | None = None, + device_map="cuda", + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + trust_remote_code: bool = True, model_prompts: dict[str, str] | None = None, **kwargs, ) -> None: requires_package( - self, "flash_attn", model, "pip install 'mteb[flash_attention]'" + self, + "flash_attn", + model, + "pip install 'mteb[flash_attention]'", ) requires_package(self, "peft", model, "pip install 'mteb[jina-v4]'") requires_package(self, "torchvision", model, "pip install 'mteb[jina-v4]'") @@ -238,48 +244,268 @@ def __init__( import peft # noqa: F401 import transformers # noqa: F401 - super().__init__(model, revision, model_prompts, **kwargs) + self.model = AutoModel.from_pretrained( + model, + device_map=device_map, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + attn_implementation=attn_implementation, + revision=revision, + ).eval() + self.model_prompts = model_prompts or {} + self.vector_type = "single_vector" # default vector type + + def _resolve_task_parameters( + self, task_name: str | None, prompt_type: PromptType | None = None + ) -> tuple[str, str, str]: + """Resolve task parameters from task_name and prompt_type. + + Returns: + tuple: (base_task, prompt_name_param, task_type) + """ + task_type = self.get_prompt_name(self.model_prompts, task_name, prompt_type) + jina_task_name = self.model_prompts.get(task_type) if task_type else None + + # Determine prompt name parameter + if jina_task_name and "query" in jina_task_name: + prompt_name_param = "query" + elif jina_task_name and "passage" in jina_task_name: + prompt_name_param = "passage" + else: + prompt_name_param = "query" # default fallback + + jina_task_name = get_programming_task_override(task_name, jina_task_name) + # Extract base task (e.g., "retrieval" from "retrieval.query") + base_task = jina_task_name.split(".")[0] if jina_task_name else "retrieval" + + return base_task, prompt_name_param, task_type + + @staticmethod + def _log_task_info( + task_name: str, + prompt_type: PromptType | None, + prompt_name: str | None, + sentences_count: int, + ) -> None: + """Log task and prompt information.""" + if prompt_name: + logger.info(f"Using {prompt_name=} for {task_name=} {prompt_type=}") + else: + logger.info(f"No model prompts found for {task_name=} {prompt_type=}") + logger.info(f"Encoding {sentences_count} sentences.") def encode( self, sentences: Sequence[str], *, task_name: str, + batch_size: int = 32, prompt_type: PromptType | None = None, **kwargs: Any, ) -> np.ndarray: prompt_name = self.get_prompt_name(self.model_prompts, task_name, prompt_type) - if prompt_name: - logger.info( - f"Using prompt_name={prompt_name} for task={task_name} prompt_type={prompt_type}" - ) + self._log_task_info(task_name, prompt_type, prompt_name, len(sentences)) + + embeddings = self.get_text_embeddings( + texts=list(sentences), + task_name=task_name, + batch_size=batch_size, + prompt_type=prompt_type, + return_numpy=True, + **kwargs, + ) + + return embeddings + + def get_text_embeddings( + self, + texts: list[str], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + return_numpy=False, + **kwargs: Any, + ): + # Resolve task parameters + base_task, prompt_name_param, task_type = self._resolve_task_parameters( + task_name, prompt_type + ) + + if task_type.startswith("DocumentUnderstanding"): + self.vector_type = "multi_vector" else: - logger.info( - f"No model prompts found for task={task_name} prompt_type={prompt_type}" + self.vector_type = "single_vector" + + with torch.no_grad(): + return self.model.encode_text( + texts=texts, + batch_size=batch_size, + return_multivector=self.vector_type == "multi_vector", + prompt_name=prompt_name_param, + task=base_task, + return_numpy=return_numpy, ) - logger.info(f"Encoding {len(sentences)} sentences.") - # Get Jina-specific parameters - jina_task_name = self.model_prompts.get(prompt_name) if prompt_name else None - jina_prompt = ( - self.jina_task_to_prompt.get(jina_task_name) if jina_task_name else None - ) + def get_image_embeddings( + self, + images, + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + max_pixels: int = 37788800, + return_numpy=False, + **kwargs: Any, + ): + import torchvision.transforms.functional as F + from torch.utils.data import DataLoader - # Override task for programming-related content - jina_task_name = get_programming_task_override(task_name, jina_task_name) + # Resolve task parameters + base_task, _, task_type = self._resolve_task_parameters(task_name, prompt_type) - embeddings = self.model.encode( - sentences, - task=jina_task_name.split(".")[0] if jina_task_name else None, - prompt=jina_prompt, - **kwargs, + if task_type.startswith("DocumentUnderstanding"): + self.vector_type = "multi_vector" + else: + self.vector_type = "single_vector" + + all_images = [] + if isinstance(images, DataLoader): + iterator = images + else: + iterator = DataLoader(images, batch_size=batch_size) + + for batch in iterator: + for b in batch: + pil_img = ( + F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b + ) + all_images.append(pil_img) + + batch_size = 1 + return self.model.encode_image( + images=all_images, + batch_size=batch_size, + max_pixels=max_pixels, + return_multivector=self.vector_type == "multi_vector", + task=base_task, + return_numpy=return_numpy, ) - if isinstance(embeddings, torch.Tensor): - # sometimes in kwargs can be return_tensors=True - embeddings = embeddings.cpu().detach().float().numpy() + def get_fused_embeddings( + self, + *args, + **kwargs, + ): + raise NotImplementedError( + "Fused embeddings are not supported yet. Please use get_text_embeddings or get_image_embeddings." + ) + + @staticmethod + def _convert_to_torch_if_needed(embeddings): + """Convert numpy arrays to torch tensors if needed.""" + if isinstance(embeddings, np.ndarray): + return torch.from_numpy(embeddings) + elif isinstance(embeddings, list): + # Handle list of numpy arrays or tensors + converted = [] + for emb in embeddings: + if isinstance(emb, np.ndarray): + converted.append(torch.from_numpy(emb)) + else: + converted.append(emb) + return converted return embeddings + def similarity(self, a, b): + """Compute similarity between embeddings. + + Args: + a: First embedding + b: Second embedding + """ + a_torch = self._convert_to_torch_if_needed(a) + b_torch = self._convert_to_torch_if_needed(b) + + if self.vector_type == "single_vector": + return self.score_single_vector(a_torch, b_torch) + elif self.vector_type == "multi_vector": + return self.score_multi_vector(a_torch, b_torch) + else: + raise ValueError( + "vector_type must be one of the following: [`single_vector`, `multi_vector`]" + ) + + @staticmethod + def score_single_vector( + qs: torch.Tensor | list[torch.Tensor], + ps: torch.Tensor | list[torch.Tensor], + ) -> torch.Tensor: + """Compute the dot product score for the given single-vector query and passage embeddings.""" + device = "cpu" + + if len(qs) == 0: + raise ValueError("No queries provided") + if len(ps) == 0: + raise ValueError("No passages provided") + + # Normalize inputs to 2D tensors + def normalize_input(x): + if isinstance(x, torch.Tensor): + return x.unsqueeze(0) if x.ndim == 1 else x + else: # list + return torch.stack(x) if len(x) > 1 else x[0].unsqueeze(0) + + qs_stacked = normalize_input(qs).to(device) + ps_stacked = normalize_input(ps).to(device) + + # Compute scores + scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked).to(torch.float32) + + # Squeeze if single query + return scores.squeeze(0) if scores.shape[0] == 1 else scores + + @staticmethod + def score_multi_vector( + qs: list[torch.Tensor], + ps: list[torch.Tensor], + batch_size: int = 16, + ) -> torch.Tensor: + """Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.""" + device = "cpu" + + if len(qs) == 0: + raise ValueError("No queries provided") + if len(ps) == 0: + raise ValueError("No passages provided") + + scores_list: list[torch.Tensor] = [] + + for i in range(0, len(qs), batch_size): + scores_batch = [] + qs_batch = torch.nn.utils.rnn.pad_sequence( + qs[i : i + batch_size], batch_first=True, padding_value=0 + ).to(device) + for j in range(0, len(ps), batch_size): + ps_batch = torch.nn.utils.rnn.pad_sequence( + ps[j : j + batch_size], batch_first=True, padding_value=0 + ).to(device) + scores_batch.append( + torch.einsum("bnd,csd->bcns", qs_batch, ps_batch) + .max(dim=3)[0] + .sum(dim=2) + ) + scores_batch = torch.cat(scores_batch, dim=1).cpu() + scores_list.append(scores_batch) + + scores = torch.cat(scores_list, dim=0) + assert scores.shape[0] == len(qs), ( + f"Expected {len(qs)} scores, got {scores.shape[0]}" + ) + + scores = scores.to(torch.float32) + return scores + def get_programming_task_override( task_name: str, current_task_name: str | None @@ -317,22 +543,24 @@ def get_programming_task_override( loader=partial( # type: ignore JinaV4Wrapper, model="jinaai/jina-embeddings-v4", - revision="26239889730c735ed7e9a4db9180c8935faf4ba0", + revision="4a58ca57710c49f51896e4bc820e202fbf64904b", trust_remote_code=True, model_prompts={ "Retrieval-query": "retrieval.query", - "Retrieval-passage": "retrieval.passage", + "Retrieval-document": "retrieval.passage", "STS": "text-matching", + "DocumentUnderstanding": "retrieval.query", }, ), name="jinaai/jina-embeddings-v4", languages=XLMR_LANGUAGES, open_weights=True, - revision="26239889730c735ed7e9a4db9180c8935faf4ba0", + revision="4a58ca57710c49f51896e4bc820e202fbf64904b", release_date="2025-06-24", # official release date + modalities=["image", "text"], n_parameters=int(3.8 * 1e9), memory_usage_mb=7500, - max_tokens=8194, + max_tokens=32768, embed_dim=2048, license="cc-by-nc-4.0", similarity_fn_name="cosine", @@ -354,7 +582,7 @@ def get_programming_task_override( trust_remote_code=True, model_prompts={ "Retrieval-query": "retrieval.query", - "Retrieval-passage": "retrieval.passage", + "Retrieval-document": "retrieval.passage", "Clustering": "separation", "Classification": "classification", "STS": "text-matching", @@ -405,7 +633,7 @@ def get_programming_task_override( jina_embeddings_v2_base_en = ModelMeta( loader=partial( SentenceTransformerWrapper, - model_name="jinaai/jina-embeddings-v2-base-en", + model="jinaai/jina-embeddings-v2-base-en", revision="6e85f575bc273f1fd840a658067d0157933c83f0", trust_remote_code=True, ), @@ -464,7 +692,7 @@ def get_programming_task_override( jina_embeddings_v2_small_en = ModelMeta( loader=partial( SentenceTransformerWrapper, - model_name="jinaai/jina-embeddings-v2-small-en", + model="jinaai/jina-embeddings-v2-small-en", revision="44e7d1d6caec8c883c2d4b207588504d519788d0", trust_remote_code=True, ), @@ -523,7 +751,7 @@ def get_programming_task_override( jina_embedding_b_en_v1 = ModelMeta( loader=partial( SentenceTransformerWrapper, - model_name="jinaai/jina-embedding-b-en-v1", + model="jinaai/jina-embedding-b-en-v1", revision="32aa658e5ceb90793454d22a57d8e3a14e699516", ), name="jinaai/jina-embedding-b-en-v1", @@ -577,7 +805,7 @@ def get_programming_task_override( jina_embedding_s_en_v1 = ModelMeta( loader=partial( SentenceTransformerWrapper, - model_name="jinaai/jina-embedding-s-en-v1", + model="jinaai/jina-embedding-s-en-v1", revision="5ac6cd473e2324c6d5f9e558a6a9f65abb57143e", ), name="jinaai/jina-embedding-s-en-v1", diff --git a/mteb/models/kalm_models.py b/mteb/models/kalm_models.py index 19be60b7d0..7ea133a84c 100644 --- a/mteb/models/kalm_models.py +++ b/mteb/models/kalm_models.py @@ -38,10 +38,13 @@ def encode( task = get_task(task_name) # to passage prompts won't be applied to passages - if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + if ( + not self.apply_instruction_to_passages + and prompt_type == PromptType.document + ): instruction = None logger.info( - f"No instruction used, because prompt type = {prompt_type.passage}" + f"No instruction used, because prompt type = {prompt_type.document}" ) if task.metadata.type in ["STS", "PairClassification", "Summarization"]: @@ -167,6 +170,118 @@ def encode( "MTOPIntentClassification": ["train"], } + +kalm_v2_training_data = { + # from technical report + # not in MTEB: + # ExpertQA + # MEDI2BGE + # OpenOrca + # PAQ + # PubMedQA + # SearchQA + # arxiv_qa + # rag-dataset-12000 + # CC-News + # SQuAD 2.0 + # TriviaQA + # WebGPT Comparisons + # MultiNLI + # NLLB + # WikiAnswers + # SimCSE NLI + # SNLI + # Aya Dataset + # eli5 + # ---- + # in MTEB: + "CodeFeedbackMT": ["train"], + "CodeFeedbackST": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "TRECCOVID": ["train"], + "DBPedia": ["train"], + "ESCIReranking": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FEVER-NL": ["train"], # translation not trained on + "FiQA2018-NL": ["train"], # translation not trained on + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQA-NL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MultiLongDocRetrieval": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "MSMARCOv2": ["train"], + "NFCorpus": ["train"], + "SciFact": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "NQ-NL": ["train"], # translation not trained on + "YahooAnswersTopicsClassification": ["train"], + "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], + "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], + "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ + "train" + ], + "ContractNLILimitedUseLegalBenchClassification": ["train"], + "ContractNLINoLicensingLegalBenchClassification": ["train"], + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissibleCopyLegalBenchClassification": ["train"], + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"], + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], + "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], + "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], + "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], + "QuoraRetrieval": ["train"], + "NanoQuoraRetrieval": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "Banking77Classification": ["train"], + "AmazonPolarityClassification": ["train"], + "ImdbClassification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "PawsXPairClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "MultilingualSentiment": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], + "Reddit-Clustering": ["train"], + "Reddit-Clustering-P2P": ["train"], + "Stackexchange-Clustering": ["train"], + "Stackexchange-Clustering-P2P": ["train"], + "TwentyNewsgroups-Clustering": ["train"], + "ATEC": ["train"], + "BQ": ["train"], + "CQADupstack": ["train"], +} + + KaLM_task_prompts = { "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", @@ -229,6 +344,146 @@ def encode( "RuSciBenchOECDClusteringP2P": "Identify the topic or theme of the Russian articles.", } +KaLM_v2_task_prompts = { + "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", + "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classifying the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given an online banking query, find the corresponding intents", + "EmotionClassification": "Classifying the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classifying the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classifying the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classifying the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classifying the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classifying the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Categorizing the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "MultilingualSentiment": "Classifying sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classifying sentiment of the customer review for iPhone into positive or negative", + "OnlineShopping": "Classifying sentiment of the customer review into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "Cmnli-query": "Retrieve semantically similar text", + "Cmnli-document": "Retrieve semantically similar text", + "Ocnli-query": "Retrieve semantically similar text", + "Ocnli-document": "Retrieve semantically similar text", + "SprintDuplicateQuestions-query": "Retrieve semantically similar questions", + "SprintDuplicateQuestions-document": "Retrieve semantically similar questions", + "TwitterSemEval2015-query": "Retrieve semantically similar text", + "TwitterSemEval2015-document": "Retrieve semantically similar text", + "TwitterURLCorpus-query": "Retrieve semantically similar text", + "TwitterURLCorpus-document": "Retrieve semantically similar text", + "CMedQAv1-reranking": "Given a query, retrieve documents that answer the query", + "CMedQAv2-reranking": "Given a query, retrieve documents that answer the query", + "MMarcoReranking": "Given a query, retrieve documents that answer the query", + "T2Reranking": "Given a query, retrieve documents that answer the query", + "AskUbuntuDupQuestions-query": "Retrieve semantically similar questions", + "AskUbuntuDupQuestions-document": "Retrieve semantically similar questions", + "MindSmallReranking": "Given a query, retrieve documents that answer the query", + "SciDocsRR-query": "Retrieve relevant paper titles", + "SciDocsRR-document": "Retrieve relevant paper titles", + "StackOverflowDupQuestions-query": "Retrieve semantically similar questions", + "StackOverflowDupQuestions-document": "Retrieve semantically similar questions", + "CmedqaRetrieval": "Given a query, retrieve documents that answer the query", + "CovidRetrieval": "Given a query, retrieve documents that answer the query", + "DuRetrieval": "Given a query, retrieve documents that answer the query", + "EcomRetrieval": "Given a query, retrieve documents that answer the query", + "MedicalRetrieval": "Given a query, retrieve documents that answer the query", + "MMarcoRetrieval": "Given a query, retrieve documents that answer the query", + "T2Retrieval": "Given a query, retrieve documents that answer the query", + "VideoRetrieval": "Given a query, retrieve documents that answer the query", + "MSMARCO": "Given a query, retrieve documents that answer the query", + "ArguAna": "Given a query, retrieve documents that answer the query", + "ClimateFEVER": "Given a query, retrieve documents that answer the query", + "CQADupstackAndroidRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackAndroidRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackEnglishRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackEnglishRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGisRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGisRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackMathematicaRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackMathematicaRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackPhysicsRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackPhysicsRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackProgrammersRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackProgrammersRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackStatsRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackStatsRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackTexRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackTexRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWebmastersRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWebmastersRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWordpressRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWordpressRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "DBPedia": "Given a query, retrieve documents that answer the query", + "FEVER": "Given a query, retrieve documents that answer the query", + "FiQA2018": "Given a query, retrieve documents that answer the query", + "HotpotQA": "Given a query, retrieve documents that answer the query", + "NFCorpus": "Given a query, retrieve documents that answer the query", + "NQ": "Given a query, retrieve documents that answer the query", + "QuoraRetrieval-query": "Retrieve semantically similar questions", + "QuoraRetrieval-document": "Retrieve semantically similar questions", + "SCIDOCS-query": "Given a query, retrieve documents that answer the query", + "SCIDOCS-document": "Given a query, retrieve documents that answer the query", + "SciFact": "Given a query, retrieve documents that answer the query", + "Touche2020": "Given a query, retrieve documents that answer the query", + "TRECCOVID": "Given a query, retrieve documents that answer the query", + "AFQMC-query": "Retrieve semantically similar text", + "AFQMC-document": "Retrieve semantically similar text", + "ATEC-query": "Retrieve semantically similar text", + "ATEC-document": "Retrieve semantically similar text", + "BQ-query": "Retrieve semantically similar text", + "BQ-document": "Retrieve semantically similar text", + "LCQMC-query": "Retrieve semantically similar text", + "LCQMC-document": "Retrieve semantically similar text", + "PAWSX-query": "Retrieve semantically similar text", + "PAWSX-document": "Retrieve semantically similar text", + "QBQTC-query": "Retrieve semantically similar text", + "QBQTC-document": "Retrieve semantically similar text", + "STSB-query": "Retrieve semantically similar text", + "STSB-document": "Retrieve semantically similar text", + "BIOSSES-query": "Retrieve semantically similar text", + "BIOSSES-document": "Retrieve semantically similar text", + "SICK-R-query": "Retrieve semantically similar text", + "SICK-R-document": "Retrieve semantically similar text", + "STS12-query": "Retrieve semantically similar text", + "STS12-document": "Retrieve semantically similar text", + "STS13-query": "Retrieve semantically similar text", + "STS13-document": "Retrieve semantically similar text", + "STS14-query": "Retrieve semantically similar text", + "STS14-document": "Retrieve semantically similar text", + "STS15-query": "Retrieve semantically similar text", + "STS15-document": "Retrieve semantically similar text", + "STS16-query": "Retrieve semantically similar text", + "STS16-document": "Retrieve semantically similar text", + "STS17-query": "Retrieve semantically similar text", + "STS17-document": "Retrieve semantically similar text", + "STS22-query": "Retrieve semantically similar text", + "STS22-document": "Retrieve semantically similar text", + "STSBenchmark-query": "Retrieve semantically similar text", + "STSBenchmark-document": "Retrieve semantically similar text", + "SummEval-query": "Retrieve semantically similar summaries", + "SummEval-document": "Retrieve semantically similar summaries", +} KaLM_X_task_prompts = { "Classification": "classify the query into different classes.", @@ -279,7 +534,7 @@ def encode( "CMedQAv1-reranking-query": "Given a Chinese community medical question, retrieve replies that best answer the question", "CMedQAv2-reranking-query": "Given a Chinese community medical question, retrieve replies that best answer the question", "ArguAna-query": "Given a claim, find documents that refute the claim", - "ArguAna-passage": "Given a claim, find documents that refute the claim", + "ArguAna-document": "Given a claim, find documents that refute the claim", "ClimateFEVER-query": "Given a claim about climate change, retrieve documents that support or refute the claim", "ClimateFEVERHardNegatives-query": "Given a claim about climate change, retrieve documents that support or refute the claim", "DBPedia-query": "Given a query, retrieve relevant entity descriptions from DBPedia", @@ -426,9 +681,9 @@ def encode( "MIRACLRetrievalHardNegatives-query": "Retrieval relevant passage for the given query", "CQADupstackRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", "CQADupstackGamingRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", - "CQADupstackGamingRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", "CQADupstackUnixRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", - "CQADupstackUnixRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", } KaLM_INSTRUCTION = "Instruct: {instruction} \n Query: " @@ -517,6 +772,37 @@ def encode( superseded_by=None, ) +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v2 = ModelMeta( + loader=partial( # type: ignore + InstructSentenceTransformerWrapper, + model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + revision="d2a21c232dc712ae8230af56d1027cf21b7864bf", + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_v2_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + revision="d2a21c232dc712ae8230af56d1027cf21b7864bf", + release_date="2025-06-25", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=942, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_v2_training_data, + adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + superseded_by=None, +) + # KaLM_Embedding_X_0605 = ModelMeta( # loader=partial( diff --git a/mteb/models/mcinext_models.py b/mteb/models/mcinext_models.py index a5f02279ca..ce70a4bbb9 100644 --- a/mteb/models/mcinext_models.py +++ b/mteb/models/mcinext_models.py @@ -256,7 +256,7 @@ def _preprocess_sample( if task_id == 3: if sub == "sentence1" or (prompt_type and prompt_type.value == "query"): return f"{task_prompt} | متن اول : {sample}" - if sub == "sentence2" or (prompt_type and prompt_type.value == "passage"): + if sub == "sentence2" or (prompt_type and prompt_type.value == "document"): return f"{task_prompt} | متن دوم : {sample}" return sample diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index f02157aa09..78d7163ef8 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -56,7 +56,7 @@ def encode( # type: ignore # default to search_document if input_type and prompt_name are not provided prompt_name = ( self.get_prompt_name(self.model_prompts, task_name, prompt_type) - or PromptType.passage.value + or PromptType.document.value ) task = mteb.get_task(task_name) # normalization not applied to classification @@ -177,7 +177,7 @@ def encode( # type: ignore "STS": "classification: ", "Summarization": "classification: ", PromptType.query.value: "search_query: ", - PromptType.passage.value: "search_document: ", + PromptType.document.value: "search_document: ", } nomic_embed_v1_5 = ModelMeta( diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 6d3ce18aa2..72842e8190 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -224,3 +224,57 @@ def _to_numpy(self, embedding_response) -> np.ndarray: license=None, similarity_fn_name=None, ) + +text_embedding_3_small_512 = ModelMeta( + name="openai/text-embedding-3-small (embed_dim=512)", + revision="3", + release_date="2024-01-25", + languages=None, # supported languages not specified + loader=partial( # type: ignore + OpenAIWrapper, + model_name="text-embedding-3-small", + tokenizer_name="cl100k_base", + max_tokens=8191, + embed_dim=512, + ), + max_tokens=8191, + embed_dim=512, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://openai.com/index/new-embedding-models-and-api-updates/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, +) + +text_embedding_3_large_512 = ModelMeta( + name="openai/text-embedding-3-large (embed_dim=512)", + revision="3", + release_date="2024-01-25", + languages=None, # supported languages not specified + loader=partial( # type: ignore + OpenAIWrapper, + model_name="text-embedding-3-large", + tokenizer_name="cl100k_base", + max_tokens=8191, + embed_dim=512, + ), + max_tokens=8191, + embed_dim=512, + open_weights=False, + reference="https://openai.com/index/new-embedding-models-and-api-updates/", + framework=["API"], + use_instructions=False, + n_parameters=None, + memory_usage_mb=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + license=None, + similarity_fn_name=None, +) diff --git a/mteb/models/openclip_models.py b/mteb/models/openclip_models.py index f0376e1bca..2af6cf45fc 100644 --- a/mteb/models/openclip_models.py +++ b/mteb/models/openclip_models.py @@ -17,7 +17,7 @@ def openclip_loader(**kwargs): model_name = kwargs.get("model_name", "CLIP-ViT") requires_package( openclip_loader, - "open_clip_torch", + "open_clip", model_name, "pip install 'mteb[open_clip_torch]'", ) diff --git a/mteb/models/opensearch_neural_sparse_models.py b/mteb/models/opensearch_neural_sparse_models.py new file mode 100644 index 0000000000..0ed6f4915c --- /dev/null +++ b/mteb/models/opensearch_neural_sparse_models.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +from collections.abc import Sequence +from functools import partial + +import sentence_transformers +import torch + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.wrapper import Wrapper + +v2_training_data = { + "MSMARCO": ["train"], + # not in MTEB. see https://huggingface.co/datasets/sentence-transformers/embedding-training-data + # "eli5_question_answer": ["train"], + # "gooaq_pairs": ["train"], + # "searchQA_top5_snippets": ["train"], + # "squad_pairs": ["train"], + # "stackexchange_duplicate_questions_body_body": ["train"], + # "stackexchange_duplicate_questions_title_title": ["train"], + # "stackexchange_duplicate_questions_title-body_title-body": ["train"], + # "WikiAnswers": ["train"], + # "wikihow": ["train"], + # "yahoo_answers_question_answer": ["train"], + # "yahoo_answers_title_answer": ["train"], + # "yahoo_answers_title_question": ["train"], +} + + +v3_training_data = v2_training_data | { + "HotpotQA": ["train"], + "FEVER": ["train"], + "FIQA": ["train"], + "NFCORPUS": ["train"], + "SCIFACT": ["train"], + # not in MTEB. see https://huggingface.co/datasets/sentence-transformers/embedding-training-data + # "NQ-train_pairs": ["train"], + # "quora_duplicates": ["train"], +} + + +class SparseEncoderWrapper(Wrapper): + def __init__( + self, + model_name: str, + torch_dtype: torch.dtype = torch.float16, + **kwargs, + ): + if sentence_transformers.__version__ < "5.0.0": + raise ImportError( + "sentence-transformers version must be >= 5.0.0 to load sparse encoder" + ) + from sentence_transformers.sparse_encoder import SparseEncoder + + self.model_name = model_name + self.kwargs = kwargs + self.model = SparseEncoder(model_name, **kwargs) + self.model.to(torch_dtype) + self.batch_size = kwargs.get("batch_size", 1000) + + def similarity( + self, query_embeddings: torch.Tensor, corpus_embeddings: torch.Tensor + ) -> torch.Tensor: + """Compute similarity between sparse query_embeddings and corpus_embeddings in batches. + + Args: + query_embeddings: sparse COO tensor of shape (num_queries, dim) + corpus_embeddings: tensor of shape (num_corpus, dim) + + Returns: + Similarity matrix of shape (num_queries, num_corpus) + """ + sims = [] + num_queries = query_embeddings.size(0) + batch_size = self.batch_size + + # Ensure query_embeddings is coalesced sparse COO + q = query_embeddings.coalesce() + indices = q.indices() # 2 x nnz: [row, col] + values = q.values() # nnz + n_cols = q.size(1) + + # Iterate over sparse query embeddings in batches + for start in range(0, num_queries, batch_size): + end = min(start + batch_size, num_queries) + # Select non-zero entries for this batch + mask = (indices[0] >= start) & (indices[0] < end) + sel_idx = indices[:, mask].clone() + sel_idx[0] -= start # shift row indices to batch-local + sel_vals = values[mask].clone() + + # Build sparse batch tensor of shape (batch_rows, dim) + batch_q = torch.sparse_coo_tensor( + sel_idx, + sel_vals, + size=(end - start, n_cols), + device=q.device, + dtype=q.dtype, + ).coalesce() + + # Compute similarity for this sparse batch + sim_batch = self.model.similarity(batch_q, corpus_embeddings) + sims.append(sim_batch) + + # Concatenate all batch results + return torch.cat(sims, dim=0) + + def encode( + self, sentences: Sequence[str], prompt_type: PromptType | None = None, **kwargs + ): + if prompt_type is not None and prompt_type == PromptType.query: + return self.model.encode_query( + sentences, # type: ignore[arg-type] + **kwargs, + ) + return self.model.encode_document(sentences, **kwargs) # type: ignore[return-value] + + +opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta( + name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte", + languages=["eng-Latn"], + open_weights=True, + revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6", + release_date="2025-06-18", + n_parameters=137_394_234, + memory_usage_mb=549, + embed_dim=30522, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample", + public_training_data=True, + use_instructions=True, + training_datasets=v3_training_data, + loader=partial( # type: ignore[call-arg] + SparseEncoderWrapper, + model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte", + trust_remote_code=True, + revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6", + ), +) + + +opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta( + name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill", + languages=["eng-Latn"], + open_weights=True, + revision="babf71f3c48695e2e53a978208e8aba48335e3c0", + release_date="2025-03-28", + n_parameters=66_985_530, + memory_usage_mb=267, + embed_dim=30522, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample", + public_training_data=True, + use_instructions=True, + training_datasets=v3_training_data, + loader=partial( # type: ignore[call-arg] + SparseEncoderWrapper, + model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill", + revision="babf71f3c48695e2e53a978208e8aba48335e3c0", + ), +) + +opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta( + name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill", + languages=["eng-Latn"], + open_weights=True, + revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f", + release_date="2024-07-17", + n_parameters=66_985_530, + memory_usage_mb=267, + embed_dim=30522, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample", + public_training_data=True, + use_instructions=True, + training_datasets=v2_training_data, + loader=partial( # type: ignore[call-arg] + SparseEncoderWrapper, + model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill", + revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f", + ), +) + + +opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta( + name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini", + languages=["eng-Latn"], + open_weights=True, + revision="4af867a426867dfdd744097531046f4289a32fdd", + release_date="2024-07-18", + n_parameters=22_744_506, + memory_usage_mb=86, + embed_dim=30522, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample", + public_training_data=True, + use_instructions=True, + training_datasets=v2_training_data, + loader=partial( # type: ignore[call-arg] + SparseEncoderWrapper, + model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini", + revision="4af867a426867dfdd744097531046f4289a32fdd", + ), +) + +opensearch_neural_sparse_encoding_doc_v1 = ModelMeta( + name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1", + languages=["eng-Latn"], + open_weights=True, + revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d", + release_date="2024-03-07", + n_parameters=132_955_194, + memory_usage_mb=507, + embed_dim=30522, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v1", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample", + public_training_data=True, + use_instructions=True, + training_datasets={ + "MSMARCO": ["train"], + }, + loader=partial( # type: ignore[call-arg] + SparseEncoderWrapper, + model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1", + revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d", + ), +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 84c554b430..150570c3cc 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -28,6 +28,7 @@ clip_models, cnn14_model, codesage_models, + codi_models, cohere_models, cohere_v, colbert_models, @@ -46,6 +47,7 @@ geogpt_models, gme_v_models, google_models, + granite_vision_embedding_models, gritlm_models, gte_models, hinvec_models, @@ -81,6 +83,7 @@ nvidia_models, openai_models, openclip_models, + opensearch_neural_sparse_models, ops_moa_models, piccolo_models, promptriever_models, @@ -88,6 +91,7 @@ qtack_models, qwen2_models, qwen3_models, + qzhou_models, relle_models, repllama_models, rerankers_custom, @@ -95,6 +99,7 @@ richinfoai_models, ru_sentence_models, salesforce_models, + samilpwc_models, seamlessm4t_models, searchmap_models, seed_1_6_embedding_models, @@ -112,6 +117,7 @@ unispeech_models, vdr_models, vggish_models, + vi_vn_models, vista_models, vlm2vec_models, voyage_models, @@ -154,6 +160,7 @@ e5_v, evaclip_models, google_models, + granite_vision_embedding_models, gritlm_models, gte_models, hinvec_models, @@ -186,6 +193,7 @@ nvidia_llama_nemoretriever_colemb, openai_models, openclip_models, + opensearch_neural_sparse_models, ops_moa_models, piccolo_models, gme_v_models, @@ -194,6 +202,7 @@ qtack_models, relle_models, qwen3_models, + qzhou_models, repllama_models, rerankers_custom, rerankers_monot5_based, @@ -203,6 +212,7 @@ ua_sentence_models, seed_1_6_embedding_models, salesforce_models, + samilpwc_models, searchmap_models, sentence_transformers_models, shuu_model, @@ -237,6 +247,8 @@ colsmol_models, geogpt_models, mcinext_models, + vi_vn_models, + codi_models, ] MODEL_REGISTRY = {} diff --git a/mteb/models/qwen3_models.py b/mteb/models/qwen3_models.py index 3571cc99c1..33862939b4 100644 --- a/mteb/models/qwen3_models.py +++ b/mteb/models/qwen3_models.py @@ -11,7 +11,7 @@ def instruction_template( instruction: str, prompt_type: PromptType | None = None ) -> str: - if not instruction or prompt_type == PromptType.passage: + if not instruction or prompt_type == PromptType.document: return "" if isinstance(instruction, dict): if prompt_type is None: diff --git a/mteb/models/qzhou_models.py b/mteb/models/qzhou_models.py new file mode 100644 index 0000000000..20e92f3d16 --- /dev/null +++ b/mteb/models/qzhou_models.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from functools import partial + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.bge_models import ( + bge_chinese_training_data, + bge_full_data, + bge_m3_training_data, +) +from mteb.models.e5_instruct import E5_MISTRAL_TRAINING_DATA +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper + + +def instruction_template( + instruction: str, prompt_type: PromptType | None = None +) -> str: + if not instruction or prompt_type == PromptType.document: + return "" + if isinstance(instruction, dict): + if prompt_type is None: + instruction = "Given a web search query, retrieve relevant passages that answer the query" + else: + instruction = instruction[prompt_type] + return f"Instruct: {instruction}\nQuery:" + + +qzhou_training_data = { + "LCQMC": ["train"], + "PAWSX": ["train"], + "TNews": ["train"], + "Waimai": ["train"], + "ImdbClassification": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MindSmallReranking": ["train"], + "STS12": ["train"], + "STS22.v2": ["train"], + "STSBenchmark": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], +} + +QZhou_Embedding = ModelMeta( + loader=partial( + InstructSentenceTransformerWrapper, + model_name="Kingsoft-LLM/QZhou-Embedding", + revision="87e18b08783729eb4b778d01d297e6ddcd7e2f18", + instruction_template=instruction_template, + apply_instruction_to_passages=False, + ), + name="Kingsoft-LLM/QZhou-Embedding", + languages=["eng-Latn", "zho-Hans"], + open_weights=True, + revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8", + release_date="2025-08-24", + n_parameters=7_070_619_136, + memory_usage_mb=29070, + embed_dim=3584, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/Kingsoft-LLM/QZhou-Embedding", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={ + **bge_m3_training_data, + **bge_chinese_training_data, + **bge_full_data, + **E5_MISTRAL_TRAINING_DATA, + **qzhou_training_data, + # Not in MTEB: + # "Shitao/MLDR": ["train"], + # "FreedomIntelligence/Huatuo26M-Lite": ["train"], + # "infgrad/retrieval_data_llm": ["train"], + }, +) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index eed13cd71b..e704cb865a 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -121,7 +121,7 @@ def loader_inner(**kwargs: Any) -> Encoder: model_prompts = { PromptType.query.value: "query: ", - PromptType.passage.value: "passage: ", + PromptType.document.value: "passage: ", } repllama_llama2_original = ModelMeta( diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index f51f03d5f9..cab4f02d74 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -35,23 +35,23 @@ "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ", "RuBQRetrieval": { "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", - "passage": "", + "document": "", }, "RuBQReranking": { "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", - "passage": "", + "document": "", }, "RiaNewsRetrieval": { "query": "Given a news title, retrieve relevant news article\nquery: ", - "passage": "", + "document": "", }, "MIRACLReranking": { "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", - "passage": "", + "document": "", }, "MIRACLRetrieval": { "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", - "passage": "", + "document": "", }, } @@ -156,7 +156,7 @@ sentence_transformers_loader, model_name="deepvk/USER-base", revision="436a489a2087d61aa670b3496a9915f84e46c861", - model_prompts={"query": "query: ", "passage": "passage: "}, + model_prompts={"query": "query: ", "document": "passage: "}, ), name="deepvk/USER-base", languages=["rus-Cyrl"], @@ -483,11 +483,11 @@ "PairClassification": "classification: ", "Reranking": "classification: ", f"Reranking-{PromptType.query.value}": "search_query: ", - f"Reranking-{PromptType.passage.value}": "search_document: ", + f"Reranking-{PromptType.document.value}": "search_document: ", "STS": "classification: ", "Summarization": "classification: ", PromptType.query.value: "search_query: ", - PromptType.passage.value: "search_document: ", + PromptType.document.value: "search_document: ", # Override some prompts for ruMTEB tasks "HeadlineClassification": "clustering: ", "InappropriatenessClassification": "clustering: ", @@ -549,11 +549,11 @@ "PairClassification": "paraphrase: ", "Reranking": "paraphrase: ", f"Reranking-{PromptType.query.value}": "search_query: ", - f"Reranking-{PromptType.passage.value}": "search_document: ", + f"Reranking-{PromptType.document.value}": "search_document: ", "STS": "paraphrase: ", "Summarization": "categorize: ", PromptType.query.value: "search_query: ", - PromptType.passage.value: "search_document: ", + PromptType.document.value: "search_document: ", # Override some prompts for ruMTEB tasks "CEDRClassification": "categorize_sentiment: ", "GeoreviewClassification": "categorize_sentiment: ", @@ -803,11 +803,11 @@ "PairClassification": "classification: ", "Reranking": "classification: ", f"Reranking-{PromptType.query.value}": "search_query: ", - f"Reranking-{PromptType.passage.value}": "search_document: ", + f"Reranking-{PromptType.document.value}": "search_document: ", "STS": "classification: ", "Summarization": "clustering: ", PromptType.query.value: "search_query: ", - PromptType.passage.value: "search_document: ", + PromptType.document.value: "search_document: ", } user2_small = ModelMeta( loader=partial( diff --git a/mteb/models/samilpwc_models.py b/mteb/models/samilpwc_models.py new file mode 100644 index 0000000000..ba1f8245ea --- /dev/null +++ b/mteb/models/samilpwc_models.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from functools import partial + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.e5_models import ME5_TRAINING_DATA +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper + +SAMILPWC_GENAI_TRAINING_DATA = { + "KorSTS": ["train"], + "KLUE-TC": ["train"], + "KLUE-STS": ["train"], + "KoTripletQA": ["train"], + "MIRACL": ["train"], + "KorNLI": ["train"], +} + +INSTRUCTION = "Instruct: {instruction}\nQuery: " + + +def instruction_template( + instruction: str, prompt_type: PromptType | None = None +) -> str: + if not instruction or prompt_type == PromptType.document: + return "" + if isinstance(instruction, dict): + if prompt_type is None: + instruction = list(instruction.values())[0] + else: + instruction = instruction[prompt_type] + return INSTRUCTION.format(instruction=instruction) + + +def instruct_loader(model_name_or_path, **kwargs): + model = InstructSentenceTransformerWrapper( + model_name_or_path, + revision=kwargs.pop("revision", None), + instruction_template=instruction_template, + apply_instruction_to_passages=False, + **kwargs, + ) + encoder = model.model._first_module() + if encoder.auto_model.config._attn_implementation == "flash_attention_2": + encoder.tokenizer.padding_side = "left" + return model + + +samilpwc_expr = ModelMeta( + loader=partial( + instruct_loader, + model_name_or_path="SamilPwC-AXNode-GenAI/PwC-Embedding_expr", + revision="33358978be40f36491045f9c2a359d38c3f50047", + ), + name="SamilPwC-AXNode-GenAI/PwC-Embedding_expr", + languages=[ + "kor-Hang", + ], + open_weights=True, + revision="33358978be40f36491045f9c2a359d38c3f50047", + release_date="2025-08-12", + n_parameters=560_000_000, + memory_usage_mb=2136, + embed_dim=1024, + license="apache-2.0", + max_tokens=514, + reference="https://huggingface.co/SamilPwC-AXNode-GenAI/PwC-Embedding_expr", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + adapted_from="intfloat/multilingual-e5-large-instruct", + training_datasets={ + **ME5_TRAINING_DATA, + **SAMILPWC_GENAI_TRAINING_DATA, + }, +) diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 872a3fffe9..e9d5492803 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -45,6 +45,13 @@ def __init__( ): try: model_prompts = self.validate_task_to_prompt_name(self.model.prompts) + + if ( + len(self.model.prompts) == 2 + and self.model.prompts.get("query", "") == "" + and self.model.prompts.get("document", "") == "" + ): + model_prompts = None except KeyError: model_prompts = None logger.warning( @@ -89,24 +96,24 @@ def encode( Returns: The encoded sentences. """ + prompt = None prompt_name = None if self.model_prompts is not None: prompt_name = self.get_prompt_name( self.model_prompts, task_name, prompt_type ) + prompt = self.model_prompts.get(prompt_name, None) if prompt_name: logger.info( - f"Using prompt_name={prompt_name} for task={task_name} prompt_type={prompt_type}" + f"Using {prompt_name=} for task={task_name} {prompt_type=} with {prompt=}" ) else: - logger.info( - f"No model prompts found for task={task_name} prompt_type={prompt_type}" - ) + logger.info(f"No model prompts found for task={task_name} {prompt_type=}") logger.info(f"Encoding {len(sentences)} sentences.") embeddings = self.model.encode( sentences, - prompt_name=prompt_name, + prompt=prompt, **kwargs, ) if isinstance(embeddings, torch.Tensor): diff --git a/mteb/models/vi_vn_models.py b/mteb/models/vi_vn_models.py new file mode 100644 index 0000000000..8353d42445 --- /dev/null +++ b/mteb/models/vi_vn_models.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, sentence_transformers_loader + +greennode_embedding_large_vn_v1_training_data = { + "GreenNodeTableMarkdownRetrieval": ["train"], +} + +greennode_embedding_large_vn_v1 = ModelMeta( + name="GreenNode/GreenNode-Embedding-Large-VN-V1", + revision="660def1f6e1c8ecdf39f6f9c95829e3cf0cef837", + release_date="2024-04-11", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="GreenNode/GreenNode-Embedding-Large-VN-V1", + ), + open_weights=True, + n_parameters=568_000_000, + memory_usage_mb=2167, + embed_dim=1024, + license="cc-by-4.0", + max_tokens=8194, + reference="https://huggingface.co/GreenNode/GreenNode-Embedding-Large-VN-V1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/GreenNode/GreenNode-Table-Markdown-Retrieval-VN", + training_datasets=greennode_embedding_large_vn_v1_training_data, + adapted_from="BAAI/bge-m3", +) + +greennode_embedding_large_vn_mixed_v1 = ModelMeta( + name="GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1", + revision="1d3dddb3862292dab4bd3eddf0664c0335ad5843", + release_date="2024-04-11", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1", + ), + open_weights=True, + n_parameters=568_000_000, + memory_usage_mb=2167, + embed_dim=1024, + license="cc-by-4.0", + max_tokens=8194, + reference="https://huggingface.co/GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/GreenNode/GreenNode-Table-Markdown-Retrieval-VN", + training_datasets=greennode_embedding_large_vn_v1_training_data, + adapted_from="BAAI/bge-m3", +) + +aiteamvn_vietnamese_embeddings = ModelMeta( + name="AITeamVN/Vietnamese_Embedding", + revision="fcbbb905e6c3757d421aaa5db6fd7c53d038f6fb", + release_date="2024-03-17", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, model_name="AITeamVN/Vietnamese_Embedding" + ), + open_weights=True, + n_parameters=568_000_000, + memory_usage_mb=2166, + embed_dim=1024, + license="cc-by-4.0", + max_tokens=8194, + reference="https://huggingface.co/AITeamVN/Vietnamese_Embedding", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, + adapted_from="BAAI/bge-m3", +) + +hiieu_halong_embedding = ModelMeta( + name="hiieu/halong_embedding", + revision="b57776031035f70ed2030d2e35ecc533eb0f8f71", + release_date="2024-07-06", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, model_name="hiieu/halong_embedding" + ), + use_instructions=False, + open_weights=True, + n_parameters=278_000_000, + memory_usage_mb=1061, + embed_dim=768, + license="apache-2.0", + max_tokens=514, + reference="https://huggingface.co/hiieu/halong_embedding", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + public_training_code=None, + public_training_data=None, + training_datasets=None, + adapted_from="intfloat/multilingual-e5-base", +) + +sup_simcse_vietnamese_phobert_base_ = ModelMeta( + name="VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", + revision="608779b86741a8acd8c8d38132974ff04086b138", + release_date="2021-05-26", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", + ), + use_instructions=False, + open_weights=True, + n_parameters=135_000_000, + memory_usage_mb=517, + max_tokens=256, + embed_dim=768, + license="apache-2.0", + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", + similarity_fn_name="cosine", + training_datasets=None, +) + +bkai_foundation_models_vietnamese_bi_encoder = ModelMeta( + name="bkai-foundation-models/vietnamese-bi-encoder", + revision="84f9d9ada0d1a3c37557398b9ae9fcedcdf40be0", + release_date="2023-09-09", + languages=[ + "vie-Latn", + ], + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="bkai-foundation-models/vietnamese-bi-encoder", + ), + use_instructions=False, + open_weights=True, + n_parameters=135_000_000, + memory_usage_mb=515, + max_tokens=256, + embed_dim=768, + license="apache-2.0", + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/bkai-foundation-models/vietnamese-bi-encoder", + similarity_fn_name="cosine", + training_datasets=None, +) diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index 65ca7b4004..38823d77a7 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -EncodeTypes = Literal["query", "passage"] +EncodeTypes = Literal["query", "document"] class VLM2VecWrapper: diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index b1eb33442a..15c934d573 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -140,9 +140,34 @@ def _batched_encode( model_prompts = { PromptType.query.value: "query", - PromptType.passage.value: "document", + PromptType.document.value: "document", } +voyage_3_5 = ModelMeta( + name="voyageai/voyage-3.5", + revision="1", + release_date="2025-01-21", + languages=None, # supported languages not specified + loader=partial( + VoyageWrapper, + model_name="voyage-3.5", + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=1024, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://docs.voyageai.com/docs/embeddings", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + voyage_large_2_instruct = ModelMeta( name="voyageai/voyage-large-2-instruct", revision="1", diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py index 48a083574c..fada131681 100644 --- a/mteb/models/voyage_v.py +++ b/mteb/models/voyage_v.py @@ -87,7 +87,7 @@ def get_text_embeddings( **kwargs: Any, ): if input_type is None and prompt_type is not None: - if prompt_type == PromptType.passage: + if prompt_type == PromptType.document: input_type = "document" elif prompt_type == PromptType.query: input_type = "query" @@ -119,7 +119,7 @@ def get_image_embeddings( **kwargs: Any, ): if input_type is None and prompt_type is not None: - if prompt_type == PromptType.passage: + if prompt_type == PromptType.document: input_type = "document" elif prompt_type == PromptType.query: input_type = "query" @@ -175,7 +175,7 @@ def get_fused_embeddings( raise ValueError("Either texts or images must be provided") if input_type is None and prompt_type is not None: - if prompt_type == PromptType.passage: + if prompt_type == PromptType.document: input_type = "document" elif prompt_type == PromptType.query: input_type = "query" diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index 73ebdde341..68ec09ae62 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -35,7 +35,7 @@ def get_prompt_name( Args: task_to_prompt: The tasks names and their corresponding prompt_names task_name: The task name to use for building the encoding prompt - prompt_type: The prompt type (e.g. "query" | "passage") to use for building the encoding prompt + prompt_type: The prompt type (e.g. "query" | "document") to use for building the encoding prompt """ task = mteb.get_task(task_name=task_name) task_type = task.metadata.type @@ -74,7 +74,7 @@ def validate_task_to_prompt_name( prompt_types = [e.value for e in PromptType] for task_name in task_to_prompt_name: if "-" in task_name and task_name.endswith( - (f"-{PromptType.query.value}", f"-{PromptType.passage.value}") + (f"-{PromptType.query.value}", f"-{PromptType.document.value}") ): task_name, prompt_type = task_name.rsplit("-", 1) if prompt_type not in prompt_types: diff --git a/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py b/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py index 12fa0c0dd0..545c92164e 100644 --- a/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py +++ b/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py @@ -6,8 +6,8 @@ from .CMU_Arctic import * from .CommonVoice import * from .EmoVDB import * -from .GigaSpeech import * from .Fleurs import * +from .GigaSpeech import * from .HiFiTTS import * from .LibriTTS import * from .MACS import * diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 3d3a4c3538..93e8d670b1 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -22,6 +22,7 @@ from .multilingual.NusaXBitextMining import * from .multilingual.PhincBitextMining import * from .multilingual.RomaTalesBitextMining import * +from .multilingual.RuSciBenchBitextMining import * from .multilingual.TatoebaBitextMining import * from .multilingual.WebFAQBitextMining import * from .srn.SRNCorpusBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/RuSciBenchBitextMining.py b/mteb/tasks/BitextMining/multilingual/RuSciBenchBitextMining.py new file mode 100644 index 0000000000..f696b4abf0 --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/RuSciBenchBitextMining.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuSciBenchBitextMining(AbsTaskBitextMining, MultilingualTask): + fast_loading = True + metadata = TaskMetadata( + name="RuSciBenchBitextMining", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining", + "revision": "e5840033c5cf2573932db027ac8001fe0a7eb6fa", + }, + description="""This task focuses on finding translations of scientific articles. + The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. + Russian authors often provide English translations for their abstracts and titles, + and the data consists of these paired titles and abstracts. The task evaluates a model's ability + to match an article's Russian title and abstract to its English counterpart, or vice versa.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="BitextMining", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "ru-en": ["rus-Cyrl", "eng-Latn"], + "en-ru": ["eng-Latn", "rus-Cyrl"], + }, + main_score="f1", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="not specified", + dialect=[], + sample_creation="found", + annotations_creators="derived", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Given the following title and abstract of the scientific article, find its translation", + ) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 36592d1e0d..d70c94944d 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -16,6 +16,7 @@ from .ces.CzechSubjectivityClassification import * from .dan.AngryTweetsClassification import * from .dan.DanishPoliticalCommentsClassification import * +from .dan.DdiscoCohesionClassification import * from .dan.DKHateClassification import * from .dan.LccSentimentClassification import * from .deu.GermanPoliticiansTwitterSentimentClassification import * @@ -76,6 +77,7 @@ from .ita.ItalianLinguistAcceptabilityClassification import * from .ita.SardiStanceClassification import * from .jav.JavaneseIMDBClassification import * +from .jpn.JapaneseSentimentClassification import * from .jpn.WRIMEClassification import * from .kan.KannadaNewsClassification import * from .kor.KlueTC import * @@ -109,6 +111,8 @@ from .multilingual.NusaParagraphEmotionClassification import * from .multilingual.NusaParagraphTopicClassification import * from .multilingual.NusaXSenti import * +from .multilingual.ru_nlu_intent_classification import * +from .multilingual.RuSciBenchClassification import * from .multilingual.ScalaClassification import * from .multilingual.ScandiSentClassification import * from .multilingual.SIB200Classification import * @@ -132,7 +136,6 @@ from .rus.HeadlineClassification import * from .rus.InappropriatenessClassification import * from .rus.KinopoiskClassification import * -from .rus.ru_nlu_intent_classification import * from .rus.ru_toixic_classification_okmlcup import * from .rus.RuReviewsClassification import * from .rus.RuSciBenchGRNTIClassification import * @@ -160,6 +163,18 @@ from .tur.TurkishProductSentimentClassification import * from .ukr.UkrFormalityClassification import * from .urd.UrduRomanSentimentClassification import * +from .vie.AmazonCounterfactualVNClassification import * +from .vie.AmazonPolarityVNClassification import * +from .vie.AmazonReviewsVNClassification import * +from .vie.Banking77VNClassification import * +from .vie.EmotionVNClassification import * +from .vie.ImdbVNClassification import * +from .vie.MassiveIntentVNClassification import * +from .vie.MassiveScenarioVNClassification import * +from .vie.MTOPDomainVNClassification import * +from .vie.MTOPIntentVNClassification import * +from .vie.ToxicConversationsVNClassification import * +from .vie.TweetSentimentExtractionVNClassification import * from .vie.VieStudentFeedbackClassification import * from .zho.CMTEBClassification import * from .zho.YueOpenriceReviewClassification import ( diff --git a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py index 9b1f68f0a3..ef3cea7172 100644 --- a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py @@ -5,6 +5,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification): + superseded_by = "CSFDCZMovieReviewSentimentClassification.v2" metadata = TaskMetadata( name="CSFDCZMovieReviewSentimentClassification", description="The dataset contains 30k user reviews from csfd.cz in Czech.", @@ -49,3 +50,49 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES ) + + +class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="CSFDCZMovieReviewSentimentClassification.v2", + description="""The dataset contains 30k user reviews from csfd.cz in Czech. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2304.01922", + dataset={ + "path": "mteb/csfdcz_movie_review_sentiment", + "revision": "bda232f79c949fd881572f7e1b9ad59fd04a6c7c", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2002-06-28", "2020-03-13"), + eval_splits=["test"], + eval_langs=["ces-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{štefánik2023resources, + archiveprefix = {arXiv}, + author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, + eprint = {2304.01922}, + primaryclass = {cs.CL}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, +} +""", + adapted_from=["CSFDCZMovieReviewSentimentClassification"], + ) + # Increase the samples_per_label in order to improve baseline performance + samples_per_label = 20 + + def dataset_transform(self): + N_SAMPLES = 2048 + + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES + ) diff --git a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py index cd29ac4353..0bd1a85dd6 100644 --- a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py @@ -5,6 +5,7 @@ class CzechProductReviewSentimentClassification(AbsTaskClassification): + superseded_by = "CzechProductReviewSentimentClassification.v2" metadata = TaskMetadata( name="CzechProductReviewSentimentClassification", description="User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative)", @@ -54,3 +55,54 @@ def dataset_transform(self) -> None: self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class CzechProductReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="CzechProductReviewSentimentClassification.v2", + description="""User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/W13-1609/", + dataset={ + "path": "mteb/czech_product_review_sentiment", + "revision": "1a3fb305bde30eec7067ab15ad2db9f61b115ca2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ces-Latn"], + main_score="accuracy", + date=("2013-01-01", "2013-06-01"), + dialect=[], + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{habernal-etal-2013-sentiment, + address = {Atlanta, Georgia}, + author = {Habernal, Ivan and +Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and +Steinberger, Josef}, + booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis}, + editor = {Balahur, Alexandra and +van der Goot, Erik and +Montoyo, Andres}, + month = jun, + pages = {65--74}, + publisher = {Association for Computational Linguistics}, + title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning}, + url = {https://aclanthology.org/W13-1609}, + year = {2013}, +} +""", + adapted_from=["CzechProductReviewSentimentClassification"], + ) + samples_per_label = 16 + + def dataset_transform(self) -> None: + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py index 333cd3aa4a..a2052eee43 100644 --- a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py +++ b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py @@ -5,6 +5,7 @@ class CzechSoMeSentimentClassification(AbsTaskClassification): + superseded_by = "CzechSoMeSentimentClassification.v2" metadata = TaskMetadata( name="CzechSoMeSentimentClassification", description="User comments on Facebook", @@ -51,3 +52,49 @@ def dataset_transform(self) -> None: self.dataset = self.dataset.rename_columns( {"comment": "text", "sentiment_int": "label"} ) + + +class CzechSoMeSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="CzechSoMeSentimentClassification.v2", + description="""User comments on Facebook + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/W13-1609/", + dataset={ + "path": "mteb/czech_so_me_sentiment", + "revision": "a12152e40ff9857bf3c83694528f40ec5c02aafc", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ces-Latn"], + main_score="accuracy", + date=("2013-01-01", "2013-06-01"), + dialect=[], + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{habernal-etal-2013-sentiment, + address = {Atlanta, Georgia}, + author = {Habernal, Ivan and +Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and +Steinberger, Josef}, + booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis}, + editor = {Balahur, Alexandra and +van der Goot, Erik and +Montoyo, Andres}, + month = jun, + pages = {65--74}, + publisher = {Association for Computational Linguistics}, + title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning}, + url = {https://aclanthology.org/W13-1609}, + year = {2013}, +} +""", + adapted_from=["CzechSoMeSentimentClassification"], + ) + samples_per_label = 16 diff --git a/mteb/tasks/Classification/dan/AngryTweetsClassification.py b/mteb/tasks/Classification/dan/AngryTweetsClassification.py index 886612db48..08251acc5c 100644 --- a/mteb/tasks/Classification/dan/AngryTweetsClassification.py +++ b/mteb/tasks/Classification/dan/AngryTweetsClassification.py @@ -5,6 +5,7 @@ class AngryTweetsClassification(AbsTaskClassification): + superseded_by = "AngryTweetsClassification.v2" metadata = TaskMetadata( name="AngryTweetsClassification", dataset={ @@ -39,3 +40,42 @@ class AngryTweetsClassification(AbsTaskClassification): ) samples_per_label = 16 + + +class AngryTweetsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="AngryTweetsClassification.v2", + dataset={ + "path": "mteb/angry_tweets", + "revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77", + }, + description="""A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2021.nodalida-main.53/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["dan-Latn"], + main_score="accuracy", + date=("2021-01-01", "2021-12-31"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{pauli2021danlp, + author = {Pauli, Amalie Brogaard and Barrett, Maria and Lacroix, Oph{\'e}lie and Hvingelby, Rasmus}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + pages = {460--466}, + title = {DaNLP: An open-source toolkit for Danish Natural Language Processing}, + year = {2021}, +} +""", + prompt="Classify Danish tweets by sentiment. (positive, negative, neutral).", + adapted_from=["AngryTweetsClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/dan/DKHateClassification.py b/mteb/tasks/Classification/dan/DKHateClassification.py index e67e3ebee0..e101be40b1 100644 --- a/mteb/tasks/Classification/dan/DKHateClassification.py +++ b/mteb/tasks/Classification/dan/DKHateClassification.py @@ -5,6 +5,7 @@ class DKHateClassification(AbsTaskClassification): + superseded_by = "DKHateClassification.v2" metadata = TaskMetadata( name="DKHateClassification", dataset={ @@ -69,3 +70,64 @@ def dataset_transform(self): self.dataset = self.dataset.map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] ) + + +class DKHateClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DKHateClassification.v2", + dataset={ + "path": "mteb/dk_hate", + "revision": "0468ff11393992d8347cf4282fb706fe970608d4", + }, + description="""Danish Tweets annotated for Hate Speech either being Offensive or not + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2020.lrec-1.430/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["dan-Latn"], + main_score="accuracy", + date=("2018-01-01", "2018-12-31"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{sigurbergsson-derczynski-2020-offensive, + abstract = {The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. We construct a Danish dataset DKhate containing user-generated comments from various social media platforms, and to our knowledge, the first of its kind, annotated for various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of 0.74, and the best performing system for Danish achieves a macro averaged F1-score of 0.70. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of 0.62, while the best performing system for Danish achieves a macro averaged F1-score of 0.73. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of 0.56, and the best performing system for Danish achieves a macro averaged F1-score of 0.63. Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying.}, + address = {Marseille, France}, + author = {Sigurbergsson, Gudbjartur Ingi and +Derczynski, Leon}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + pages = {3498--3508}, + publisher = {European Language Resources Association}, + title = {Offensive Language and Hate Speech Detection for {D}anish}, + url = {https://aclanthology.org/2020.lrec-1.430}, + year = {2020}, +} +""", + prompt="Classify Danish tweets based on offensiveness (offensive, not offensive)", + adapted_from=["DKHateClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py index c0bac1528e..7846b73870 100644 --- a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py +++ b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py @@ -5,6 +5,7 @@ class DanishPoliticalCommentsClassification(AbsTaskClassification): + superseded_by = "DanishPoliticalCommentsClassification.v2" metadata = TaskMetadata( name="DanishPoliticalCommentsClassification", dataset={ @@ -49,3 +50,44 @@ def dataset_transform(self): # create train and test splits self.dataset = self.dataset["train"].train_test_split(0.2, seed=self.seed) + + +class DanishPoliticalCommentsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DanishPoliticalCommentsClassification.v2", + dataset={ + "path": "mteb/danish_political_comments", + "revision": "476a9e7327aba70ad3e97a169d7310b86be9b245", + }, + description="""A dataset of Danish political comments rated for sentiment + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/danish_political_comments", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["dan-Latn"], + main_score="accuracy", + date=( + "2000-01-01", + "2022-12-31", + ), # Estimated range for the collection of comments + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@techreport{SAMsentiment, + author = {Mads Guldborg Kjeldgaard Kongsbak and Steffan Eybye Christensen and Lucas Høyberg Puvis~de~Chavannes and Peter Due Jensen}, + institution = {IT University of Copenhagen}, + title = {Sentiment Analysis Multitool, SAM}, + year = {2019}, +} +""", + prompt="Classify Danish political comments for sentiment", + adapted_from=["DanishPoliticalCommentsClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py index c1eb16d190..3a0b229517 100644 --- a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py +++ b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py @@ -5,6 +5,7 @@ class DdiscoCohesionClassification(AbsTaskClassification): + superseded_by = "Ddisco.v2" metadata = TaskMetadata( name="Ddisco", dataset={ @@ -62,3 +63,60 @@ def dataset_transform(self): self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns( ["domain"] ) + + +class DdiscoCohesionClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="Ddisco.v2", + dataset={ + "path": "mteb/ddisco_cohesion", + "revision": "b5a05bdecdfc6efc14eebc8f7a86e0986edaf5ff", + }, + description="""A Danish Discourse dataset with values for coherence and source (Wikipedia or Reddit) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2022.lrec-1.260/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["dan-Latn"], + main_score="accuracy", + date=("2021-01-01", "2022-06-25"), + domains=["Non-fiction", "Social", "Written"], + dialect=[], + task_subtypes=["Discourse coherence"], + license="cc-by-sa-3.0", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{flansmose-mikkelsen-etal-2022-ddisco, + abstract = {To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.}, + address = {Marseille, France}, + author = {Flansmose Mikkelsen, Linea and +Kinch, Oliver and +Jess Pedersen, Anders and +Lacroix, Oph{\'e}lie}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Odijk, Jan and +Piperidis, Stelios}, + month = jun, + pages = {2440--2445}, + publisher = {European Language Resources Association}, + title = {{DD}is{C}o: A Discourse Coherence Dataset for {D}anish}, + url = {https://aclanthology.org/2022.lrec-1.260}, + year = {2022}, +} +""", + adapted_from=["DdiscoCohesionClassification"], + ) diff --git a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py index 979a70c707..fa61715eea 100644 --- a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py +++ b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py @@ -5,6 +5,7 @@ class GermanPoliticiansTwitterSentimentClassification(AbsTaskClassification): + superseded_by = "GermanPoliticiansTwitterSentimentClassification.v2" metadata = TaskMetadata( name="GermanPoliticiansTwitterSentimentClassification", description="GermanPoliticiansTwitterSentiment is a dataset of German tweets categorized with their sentiment (3 classes).", @@ -52,3 +53,52 @@ class GermanPoliticiansTwitterSentimentClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_column("majority_sentiment", "label") + + +class GermanPoliticiansTwitterSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="GermanPoliticiansTwitterSentimentClassification.v2", + description="""GermanPoliticiansTwitterSentiment is a dataset of German tweets categorized with their sentiment (3 classes). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2022.konvens-1.9", + dataset={ + "path": "mteb/german_politicians_twitter_sentiment", + "revision": "aeb7e9cd08a0c77856ec5396bb82c32f309276d0", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["deu-Latn"], + main_score="accuracy", + date=("2021-01-01", "2021-12-31"), + domains=["Social", "Government", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{schmidt-etal-2022-sentiment, + address = {Potsdam, Germany}, + author = {Schmidt, Thomas and +Fehle, Jakob and +Weissenbacher, Maximilian and +Richter, Jonathan and +Gottschalk, Philipp and +Wolff, Christian}, + booktitle = {Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)}, + editor = {Schaefer, Robin and +Bai, Xiaoyu and +Stede, Manfred and +Zesch, Torsten}, + month = {12--15 } # sep, + pages = {74--87}, + publisher = {KONVENS 2022 Organizers}, + title = {Sentiment Analysis on {T}witter for the Major {G}erman Parties during the 2021 {G}erman Federal Election}, + url = {https://aclanthology.org/2022.konvens-1.9}, + year = {2022}, +} +""", + adapted_from=["GermanPoliticiansTwitterSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/deu/TenKGnadClassification.py b/mteb/tasks/Classification/deu/TenKGnadClassification.py index e9b2316d2d..ca91bf4715 100644 --- a/mteb/tasks/Classification/deu/TenKGnadClassification.py +++ b/mteb/tasks/Classification/deu/TenKGnadClassification.py @@ -5,6 +5,7 @@ class TenKGnadClassification(AbsTaskClassification): + superseded_by = "TenKGnadClassification.v2" metadata = TaskMetadata( name="TenKGnadClassification", description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).", @@ -39,3 +40,42 @@ class TenKGnadClassification(AbsTaskClassification): } """, ) + + +class TenKGnadClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TenKGnadClassification.v2", + description="""10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://tblock.github.io/10kGNAD/", + dataset={ + "path": "mteb/ten_k_gnad", + "revision": "fc6825fe0d813e7fc92f05fe63ac4bb3ee191c4d", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["deu-Latn"], + main_score="accuracy", + date=("2015-06-01", "2016-05-31"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-nc-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Schabus2017, + address = {Tokyo, Japan}, + author = {Dietmar Schabus and Marcin Skowron and Martin Trapp}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)}, + doi = {10.1145/3077136.3080711}, + month = aug, + pages = {1241--1244}, + title = {One Million Posts: A Data Set of German Online Discussions}, + year = {2017}, +} +""", + adapted_from=["TenKGnadClassification"], + ) diff --git a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py index 3c5b1350f1..7333e4c2dc 100644 --- a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py +++ b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py @@ -5,6 +5,7 @@ class AmazonPolarityClassification(AbsTaskClassification): + superseded_by = "AmazonPolarityClassification.v2" metadata = TaskMetadata( name="AmazonPolarityClassification", description="Amazon Polarity Classification Dataset.", @@ -40,3 +41,43 @@ class AmazonPolarityClassification(AbsTaskClassification): """, prompt="Classify Amazon reviews into positive or negative sentiment", ) + + +class AmazonPolarityClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonPolarityClassification.v2", + description="""Amazon Polarity Classification Dataset. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/amazon_polarity", + dataset={ + "path": "mteb/amazon_polarity", + "revision": "ec149c1fe36043668a50804214d4597804001f6f", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2012-01-01", + "2015-12-31", + ), # Estimated range for the collection of reviews + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + annotations_creators="derived", + license="apache-2.0", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{McAuley2013HiddenFA, + author = {Julian McAuley and Jure Leskovec}, + journal = {Proceedings of the 7th ACM conference on Recommender systems}, + title = {Hidden factors and hidden topics: understanding rating dimensions with review text}, + url = {https://api.semanticscholar.org/CorpusID:6440341}, + year = {2013}, +} +""", + prompt="Classify Amazon reviews into positive or negative sentiment", + adapted_from=["AmazonPolarityClassification"], + ) diff --git a/mteb/tasks/Classification/eng/ArxivClassification.py b/mteb/tasks/Classification/eng/ArxivClassification.py index d046bd3bd0..438b018ff2 100644 --- a/mteb/tasks/Classification/eng/ArxivClassification.py +++ b/mteb/tasks/Classification/eng/ArxivClassification.py @@ -5,6 +5,7 @@ class ArxivClassification(AbsTaskClassification): + superseded_by = "ArxivClassification.v2" metadata = TaskMetadata( name="ArxivClassification", description="Classification Dataset of Arxiv Papers", @@ -39,3 +40,42 @@ class ArxivClassification(AbsTaskClassification): } """, ) + + +class ArxivClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="ArxivClassification.v2", + description="""Classification Dataset of Arxiv Papers + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/arxiv", + "revision": "202e10e9a5d37a5068397b48184d0728346a7b4a", + }, + reference="https://ieeexplore.ieee.org/document/8675939", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("1998-11-11", "2019-03-28"), + domains=["Academic", "Written"], + task_subtypes=["Topic classification"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{8675939, + author = {He, Jun and Wang, Liqun and Liu, Liu and Feng, Jiao and Wu, Hao}, + doi = {10.1109/ACCESS.2019.2907992}, + journal = {IEEE Access}, + number = {}, + pages = {40707-40718}, + title = {Long Document Classification From Local Word Glimpses via Recurrent Attention Learning}, + volume = {7}, + year = {2019}, +} +""", + adapted_from=["ArxivClassification"], + ) diff --git a/mteb/tasks/Classification/eng/Banking77Classification.py b/mteb/tasks/Classification/eng/Banking77Classification.py index 5581df7fb0..f3543eec87 100644 --- a/mteb/tasks/Classification/eng/Banking77Classification.py +++ b/mteb/tasks/Classification/eng/Banking77Classification.py @@ -5,6 +5,7 @@ class Banking77Classification(AbsTaskClassification): + superseded_by = "Banking77Classification.v2" metadata = TaskMetadata( name="Banking77Classification", description="Dataset composed of online banking queries annotated with their corresponding intents.", @@ -57,3 +58,60 @@ class Banking77Classification(AbsTaskClassification): """, prompt="Given a online banking query, find the corresponding intents", ) + + +class Banking77ClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="Banking77Classification.v2", + description="""Dataset composed of online banking queries annotated with their corresponding intents. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2003.04807", + dataset={ + "path": "mteb/banking77", + "revision": "18072d2685ea682290f7b8924d94c62acc19c0b2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2019-01-01", + "2019-12-31", + ), # Estimated range for the collection of queries + domains=["Written"], + task_subtypes=[], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{casanueva-etal-2020-efficient, + address = {Online}, + author = {Casanueva, I{\~n}igo and +Tem{\v{c}}inas, Tadas and +Gerz, Daniela and +Henderson, Matthew and +Vuli{\'c}, Ivan}, + booktitle = {Proceedings of the 2nd Workshop on Natural Language Processing for Conversational AI}, + doi = {10.18653/v1/2020.nlp4convai-1.5}, + editor = {Wen, Tsung-Hsien and +Celikyilmaz, Asli and +Yu, Zhou and +Papangelis, Alexandros and +Eric, Mihail and +Kumar, Anuj and +Casanueva, I{\~n}igo and +Shah, Rushin}, + month = jul, + pages = {38--45}, + publisher = {Association for Computational Linguistics}, + title = {Efficient Intent Detection with Dual Sentence Encoders}, + url = {https://aclanthology.org/2020.nlp4convai-1.5}, + year = {2020}, +} +""", + prompt="Given a online banking query, find the corresponding intents", + adapted_from=["Banking77Classification"], + ) diff --git a/mteb/tasks/Classification/eng/DBpediaClassification.py b/mteb/tasks/Classification/eng/DBpediaClassification.py index 51904a4c08..6764e7934c 100644 --- a/mteb/tasks/Classification/eng/DBpediaClassification.py +++ b/mteb/tasks/Classification/eng/DBpediaClassification.py @@ -5,6 +5,7 @@ class DBpediaClassification(AbsTaskClassification): + superseded_by = "DBpediaClassification.v2" metadata = TaskMetadata( name="DBpediaClassification", description="DBpedia14 is a dataset of English texts from Wikipedia articles, categorized into 14 non-overlapping classes based on their DBpedia ontology.", @@ -46,3 +47,48 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train", "test"] ) + + +class DBpediaClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DBpediaClassification.v2", + description="""DBpedia14 is a dataset of English texts from Wikipedia articles, categorized into 14 non-overlapping classes based on their DBpedia ontology. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/1509.01626", + dataset={ + "path": "mteb/d_bpedia", + "revision": "e45aab5cbb44baba43d8a0640d809d2aa0a0a770", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2022-01-25", "2022-01-25"), + domains=["Encyclopaedic", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-3.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", + adapted_from=["DBpediaClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train", "test"] + ) diff --git a/mteb/tasks/Classification/eng/EmotionClassification.py b/mteb/tasks/Classification/eng/EmotionClassification.py index d04d8143a6..fcee2a86a7 100644 --- a/mteb/tasks/Classification/eng/EmotionClassification.py +++ b/mteb/tasks/Classification/eng/EmotionClassification.py @@ -5,6 +5,7 @@ class EmotionClassification(AbsTaskClassification): + superseded_by = "EmotionClassification.v2" metadata = TaskMetadata( name="EmotionClassification", description="Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.", @@ -56,3 +57,59 @@ class EmotionClassification(AbsTaskClassification): ) samples_per_label = 16 + + +class EmotionClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="EmotionClassification.v2", + description="""Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.aclweb.org/anthology/D18-1404", + dataset={ + "path": "mteb/emotion", + "revision": "13535ec7ed83ac3920c40db3c3fd4133af55cc06", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2017-01-01", + "2018-12-31", + ), # Estimated range for the collection of Twitter messages + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{saravia-etal-2018-carer, + abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.}, + address = {Brussels, Belgium}, + author = {Saravia, Elvis and +Liu, Hsien-Chi Toby and +Huang, Yen-Hao and +Wu, Junlin and +Chen, Yi-Shin}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1404}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {3687--3697}, + publisher = {Association for Computational Linguistics}, + title = {{CARER}: Contextualized Affect Representations for Emotion Recognition}, + url = {https://aclanthology.org/D18-1404}, + year = {2018}, +} +""", + prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + adapted_from=["EmotionClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py index 6d3f672f9d..1d3580e6e3 100644 --- a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py +++ b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py @@ -5,6 +5,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification): + superseded_by = "FinancialPhrasebankClassification.v2" metadata = TaskMetadata( name="FinancialPhrasebankClassification", description="Polar sentiment dataset of sentences from financial news, categorized by sentiment into positive, negative, or neutral.", @@ -41,3 +42,40 @@ class FinancialPhrasebankClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_column("sentence", "text") + + +class FinancialPhrasebankClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FinancialPhrasebankClassification.v2", + description="""Polar sentiment dataset of sentences from financial news, categorized by sentiment into positive, negative, or neutral. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/1307.5336", + dataset={ + "path": "mteb/financial_phrasebank", + "revision": "9349ecd31615a97081c245f5d7dbc0f4c6a1a656", + "name": "sentences_allagree", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2013-11-01", "2013-11-01"), + domains=["News", "Written", "Financial"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-3.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{Malo2014GoodDO, + author = {P. Malo and A. Sinha and P. Korhonen and J. Wallenius and P. Takala}, + journal = {Journal of the Association for Information Science and Technology}, + title = {Good debt or bad debt: Detecting semantic orientations in economic texts}, + volume = {65}, + year = {2014}, +} +""", + adapted_from=["FinancialPhrasebankClassification"], + ) diff --git a/mteb/tasks/Classification/eng/FrenkEnClassification.py b/mteb/tasks/Classification/eng/FrenkEnClassification.py index b9de110e20..db7a0e25d1 100644 --- a/mteb/tasks/Classification/eng/FrenkEnClassification.py +++ b/mteb/tasks/Classification/eng/FrenkEnClassification.py @@ -5,6 +5,7 @@ class FrenkEnClassification(AbsTaskClassification): + superseded_by = "FrenkEnClassification.v2" metadata = TaskMetadata( name="FrenkEnClassification", description="English subset of the FRENK dataset", @@ -39,3 +40,41 @@ class FrenkEnClassification(AbsTaskClassification): } """, ) + + +class FrenkEnClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FrenkEnClassification.v2", + description="""English subset of the FRENK dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/frenk_en", + "revision": "630d941b6e0879a7238da89af6bfe1b1eb27ca0f", + }, + reference="https://arxiv.org/abs/1906.02045", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2021-05-28", "2021-05-28"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", + adapted_from=["FrenkEnClassification"], + ) diff --git a/mteb/tasks/Classification/eng/ImdbClassification.py b/mteb/tasks/Classification/eng/ImdbClassification.py index df2ac734ed..2318fcb26c 100644 --- a/mteb/tasks/Classification/eng/ImdbClassification.py +++ b/mteb/tasks/Classification/eng/ImdbClassification.py @@ -5,6 +5,7 @@ class ImdbClassification(AbsTaskClassification): + superseded_by = "ImdbClassification.v2" metadata = TaskMetadata( name="ImdbClassification", description="Large Movie Review Dataset", @@ -52,3 +53,55 @@ class ImdbClassification(AbsTaskClassification): """, prompt="Classify the sentiment expressed in the given movie review text from the IMDB dataset", ) + + +class ImdbClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="ImdbClassification.v2", + description="""Large Movie Review Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/imdb", + "revision": "d05f0155defa7991dad75bc68c5ccb6774b1fdc5", + }, + reference="http://www.aclweb.org/anthology/P11-1015", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2000-01-01", + "2010-12-31", + ), # Estimated range for the collection of movie reviews + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{maas-etal-2011-learning, + address = {Portland, Oregon, USA}, + author = {Maas, Andrew L. and +Daly, Raymond E. and +Pham, Peter T. and +Huang, Dan and +Ng, Andrew Y. and +Potts, Christopher}, + booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, + editor = {Lin, Dekang and +Matsumoto, Yuji and +Mihalcea, Rada}, + month = jun, + pages = {142--150}, + publisher = {Association for Computational Linguistics}, + title = {Learning Word Vectors for Sentiment Analysis}, + url = {https://aclanthology.org/P11-1015}, + year = {2011}, +} +""", + prompt="Classify the sentiment expressed in the given movie review text from the IMDB dataset", + adapted_from=["ImdbClassification"], + ) diff --git a/mteb/tasks/Classification/eng/LegalBenchClassification.py b/mteb/tasks/Classification/eng/LegalBenchClassification.py index d19df22a19..96862494a5 100644 --- a/mteb/tasks/Classification/eng/LegalBenchClassification.py +++ b/mteb/tasks/Classification/eng/LegalBenchClassification.py @@ -3447,6 +3447,7 @@ def dataset_transform(self): class JCrewBlockerLegalBenchClassification(AbsTaskClassification): + superseded_by = "JCrewBlockerLegalBenchClassification.v2" metadata = TaskMetadata( name="JCrewBlockerLegalBenchClassification", description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.", @@ -3491,6 +3492,44 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("answer", "label") +class JCrewBlockerLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="JCrewBlockerLegalBenchClassification.v2", + description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/j_crew_blocker_legal_bench", + "name": "jcrew_blocker", + "revision": "692cc80266711eaa41d03c9fb168bff60807ee8a", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2016-01-01", "2023-08-23"), # best guess + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", + adapted_from=["JCrewBlockerLegalBenchClassification"], + ) + + class LearnedHandsBenefitsLegalBenchClassification(AbsTaskClassification): metadata = TaskMetadata( name="LearnedHandsBenefitsLegalBenchClassification", @@ -4362,6 +4401,7 @@ def dataset_transform(self): class LegalReasoningCausalityLegalBenchClassification(AbsTaskClassification): + superseded_by = "LegalReasoningCausalityLegalBenchClassification.v2" metadata = TaskMetadata( name="LegalReasoningCausalityLegalBenchClassification", description="Given an excerpt from a district court opinion, classify if it relies on statistical evidence in its reasoning.", @@ -4406,6 +4446,44 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("answer", "label") +class LegalReasoningCausalityLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="LegalReasoningCausalityLegalBenchClassification.v2", + description="""Given an excerpt from a district court opinion, classify if it relies on statistical evidence in its reasoning. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/legal_reasoning_causality_legal_bench", + "name": "legal_reasoning_causality", + "revision": "563c52ea5216784b608912e67049226ae8cdf702", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2020-01-01", "2023-08-23"), # best guess + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", + adapted_from=["LegalReasoningCausalityLegalBenchClassification"], + ) + + _MAUD_DATASET_MAP = [ { "name": "maud_ability_to_consummate_concept_is_subject_to_mae_carveouts", @@ -4549,6 +4627,7 @@ def dataset_transform(self): class MAUDLegalBenchClassification(AbsTaskClassification): + superseded_by = "MAUDLegalBenchClassification.v2" metadata = TaskMetadata( name="MAUDLegalBenchClassification", description="""This task was constructed from the MAUD dataset, which consists of over 47,000 labels across 152 merger agreements annotated to identify 92 questions in each agreement used by the 2021 American Bar Association (ABA) Public Target Deal Points Study. Each dataset is formatted as a series of multiple-choice questions, where given a segment of the merger agreement and a Deal Point question, the model is to choose the answer that best characterizes the agreement as response. @@ -4692,6 +4771,93 @@ def dataset_transform(self): ) +class MAUDLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MAUDLegalBenchClassification.v2", + description="""This task was constructed from the MAUD dataset, which consists of over 47,000 labels across 152 merger agreements annotated to identify 92 questions in each agreement used by the 2021 American Bar Association (ABA) Public Target Deal Points Study. Each dataset is formatted as a series of multiple-choice questions, where given a segment of the merger agreement and a Deal Point question, the model is to choose the answer that best characterizes the agreement as response. + + This is a combination of all 34 of the MAUD Legal Bench datasets: + 1. MAUD Ability To Consummate Concept Is Subject To MAE Carveouts: Given an excerpt from a merger agreement and the task is to answer: is the “ability to consummate” concept subject to Material Adverse Effect (MAE) carveouts? amongst the multiple choice options. + 2. MAUD Accuracy Of Fundamental Target RWS Bringdown Standard: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options. + 3. MAUD Accuracy Of Target Capitalization RW Outstanding Shares Bringdown Standard Answer: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options. + 4. MAUD Accuracy Of Target General RW Bringdown Timing Answer: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options. + 5. MAUD Additional Matching Rights Period For Modifications Cor: Given an excerpt from a merger agreement and the task is to answer: how long is the additional matching rights period for modifications in case the board changes its recommendation, amongst the multiple choice options. + 6. MAUD Application Of Buyer Consent Requirement Negative Interim Covenant: Given an excerpt from a merger agreement and the task is to answer: what negative covenants does the requirement of Buyer consent apply to, amongst the multiple choice options. + 7. MAUD Buyer Consent Requirement Ordinary Course: Given an excerpt from a merger agreement and the task is to answer: in case the Buyer's consent for the acquired company's ordinary business operations is required, are there any limitations on the Buyer's right to condition, withhold, or delay their consent, amongst the multiple choice options. + 8. MAUD Change In Law Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes in law that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options. + 9. MAUD Changes In GAAP Or Other Accounting Principles Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes in GAAP or other accounting principles that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options. + 10. MAUD COR Permitted In Response To Intervening Event: Given an excerpt from a merger agreement and the task is to answer: is Change of Recommendation permitted in response to an intervening event, amongst the multiple choice options. + 11. MAUD COR Permitted With Board Fiduciary Determination Only: Given an excerpt from a merger agreement and the task is to answer: is Change of Recommendation permitted as long as the board determines that such change is required to fulfill its fiduciary obligations, amongst the multiple choice options. + 12. MAUD COR Standard Intervening Event: Given an excerpt from a merger agreement and the task is to answer: what standard should the board follow when determining whether to change its recommendation in response to an intervening event, amongst the multiple choice options. + 13. MAUD COR Standard Superior Offer: Given an excerpt from a merger agreement and the task is to answer: what standard should the board follow when determining whether to change its recommendation in connection with a superior offer, amongst the multiple choice options. + 14. MAUD Definition Contains Knowledge Requirement Answer: Given an excerpt from a merger agreement and the task is to answer: what is the knowledge requirement in the definition of “Intervening Event”, amongst the multiple choice options. + 15. MAUD Definition Includes Asset Deals: Given an excerpt from a merger agreement and the task is to answer: what qualifies as a superior offer in terms of asset deals, amongst the multiple choice options. + 16. MAUD Definition Includes Stock Deals: Given an excerpt from a merger agreement and the task is to answer: what qualifies as a superior offer in terms of stock deals, amongst the multiple choice options. + 17. MAUD Fiduciary Exception Board Determination Standard: Given an excerpt from a merger agreement and the task is to answer: under what circumstances could the Board take actions on a different acquisition proposal notwithstanding the no-shop provision, amongst the multiple choice options. + 18. MAUD Fiduciary Exception Board Determination Trigger No Shop: Given an excerpt from a merger agreement and the task is to answer: what type of offer could the Board take actions on notwithstanding the no-shop provision, amongst the multiple choice options. + 19. MAUD Financial Point Of View Is The Sole Consideration: Given an excerpt from a merger agreement and the task is to answer: is “financial point of view” the sole consideration when determining whether an offer is superior, amongst the multiple choice options. + 20. MAUD FLS MAE Standard: Given an excerpt from a merger agreement and the task is to answer: what is the Forward Looking Standard (FLS) with respect to Material Adverse Effect (MAE), amongst the multiple choice options. + 21. MAUD General Economic and Financial Conditions Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes caused by general economic and financial conditions that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options. + 22. MAUD Includes Consistent With Past Practice: Given an excerpt from a merger agreement and the task is to answer: does the wording of the Efforts Covenant clause include “consistent with past practice”, amongst the multiple choice options. + 23. MAUD Initial Matching Rights Period COR: Given an excerpt from a merger agreement and the task is to answer: how long is the initial matching rights period in case the board changes its recommendation, amongst the multiple choice options. + 24. MAUD Initial Matching Rights Period FTR: Given an excerpt from a merger agreement and the task is to answer: how long is the initial matching rights period in connection with the Fiduciary Termination Right (FTR), amongst the multiple choice options. + 25. MAUDInterveningEventRequiredToOccurAfterSigningAnswer: Given an excerpt from a merger agreement and the task is to answer: is an “Intervening Event” required to occur after signing, amongst the multiple choice options. + 26. MAUD Knowledge Definition: Given an excerpt from a merger agreement and the task is to answer: what counts as Knowledge, amongst the multiple choice options. + 27. MAUDLiabilityStandardForNoShopBreachByTargetNonDORepresentatives: Given an excerpt from a merger agreement and the task is to answer: what is the liability standard for no-shop breach by Target Non-D&O Representatives, amongst the multiple choice options. + 28. MAUD Ordinary Course Efforts Standard: Given an excerpt from a merger agreement and the task is to answer: what is the efforts standard, amongst the multiple choice options. + 29. MAUD Pandemic Or Other Public Health Event Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do pandemics or other public health events have to have disproportionate impact to qualify for Material Adverse Effect (MAE), amongst the multiple choice options. + 30. MAUD Pandemic Or Other Public Health Event Specific Reference To Pandemic Related Governmental Responses Or Measures: Given an excerpt from a merger agreement and the task is to answer: is there specific reference to pandemic-related governmental responses or measures in the clause that qualifies pandemics or other public health events for Material Adverse Effect (MAE), amongst the multiple choice options. + 31. MAUD Relational Language MAE Applies To: Given an excerpt from a merger agreement and the task is to answer: what carveouts pertaining to Material Adverse Effect (MAE) does the relational language apply to?, amongst the multiple choice options. + 32. MAUD Specific Performance: Given an excerpt from a merger agreement and the task is to answer: what is the wording of the Specific Performance clause regarding the parties' entitlement in the event of a contractual breach, amongst the multiple choice options. + 33. MAUD Tail Period Length: Given an excerpt from a merger agreement and the task is to answer: how long is the Tail Period, amongst the multiple choice options. + 34. MAUD Type Of Consideration: Given an excerpt from a merger agreement and the task is to answer: what type of consideration is specified in this agreement, amongst the multiple choice options. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/maud_legal_bench", + "revision": "655744e3745703e6f551e78b4c4cba1702774ce3", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2021-01-01", "2023-08-23"), + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{wang2023maud, + author = {Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan}, + journal = {arXiv preprint arXiv:2301.00876}, + title = {MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding}, + year = {2023}, +} +""", + adapted_from=["MAUDLegalBenchClassification"], + ) + + def dataset_transform(self): + # The train split has one example in each dataset, so we combine it with the test split and resample + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) + + class NYSJudicialEthicsLegalBenchClassification(AbsTaskClassification): metadata = TaskMetadata( name="NYSJudicialEthicsLegalBenchClassification", @@ -4793,6 +4959,7 @@ def dataset_transform(self): class OPP115DataSecurityLegalBenchClassification(AbsTaskClassification): + superseded_by = "OPP115DataSecurityLegalBenchClassification.v2" metadata = TaskMetadata( name="OPP115DataSecurityLegalBenchClassification", description="Given a clause from a privacy policy, classify if the clause describes how user information is protected.", @@ -4845,7 +5012,54 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("answer", "label") +class OPP115DataSecurityLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OPP115DataSecurityLegalBenchClassification.v2", + description="""Given a clause from a privacy policy, classify if the clause describes how user information is protected. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/opp115_data_security_legal_bench", + "name": "opp115_data_security", + "revision": "8596086d90fa4f2574b15d96a60cb6bc9889806b", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2015-01-01", "2023-08-23"), + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-nc-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", + adapted_from=["OPP115DataSecurityLegalBenchClassification"], + ) + + class OPP115DoNotTrackLegalBenchClassification(AbsTaskClassification): + superseded_by = "OPP115DoNotTrackLegalBenchClassification.v2" metadata = TaskMetadata( name="OPP115DoNotTrackLegalBenchClassification", description="Given a clause from a privacy policy, classify if the clause describes if and how Do Not Track signals for online tracking and advertising are honored.", @@ -4898,6 +5112,52 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("answer", "label") +class OPP115DoNotTrackLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OPP115DoNotTrackLegalBenchClassification.v2", + description="""Given a clause from a privacy policy, classify if the clause describes if and how Do Not Track signals for online tracking and advertising are honored. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/opp115_do_not_track_legal_bench", + "name": "opp115_do_not_track", + "revision": "3e2cc83cd3fc98dc6d76825c21ed4fbed86d560c", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2015-01-01", "2023-08-23"), + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-nc-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", + adapted_from=["OPP115DoNotTrackLegalBenchClassification"], + ) + + class OPP115FirstPartyCollectionUseLegalBenchClassification(AbsTaskClassification): metadata = TaskMetadata( name="OPP115FirstPartyCollectionUseLegalBenchClassification", @@ -5166,6 +5426,7 @@ def dataset_transform(self): class OPP115UserChoiceControlLegalBenchClassification(AbsTaskClassification): + superseded_by = "OPP115UserChoiceControlLegalBenchClassification.v2" metadata = TaskMetadata( name="OPP115UserChoiceControlLegalBenchClassification", description="Given a clause fro ma privacy policy, classify if the clause describes the choices and control options available to users.", @@ -5218,7 +5479,54 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("answer", "label") +class OPP115UserChoiceControlLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OPP115UserChoiceControlLegalBenchClassification.v2", + description="""Given a clause fro ma privacy policy, classify if the clause describes the choices and control options available to users. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/opp115_user_choice_control_legal_bench", + "name": "opp115_user_choice_control", + "revision": "f308b16f8baee2080cf43e28ff01d93032d51eee", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2015-01-01", "2023-08-23"), + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-nc-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", + adapted_from=["OPP115UserChoiceControlLegalBenchClassification"], + ) + + class OralArgumentQuestionPurposeLegalBenchClassification(AbsTaskClassification): + superseded_by = "OralArgumentQuestionPurposeLegalBenchClassification.v2" metadata = TaskMetadata( name="OralArgumentQuestionPurposeLegalBenchClassification", description="""This task classifies questions asked by Supreme Court justices at oral argument into seven categories: @@ -5267,7 +5575,54 @@ def dataset_transform(self): ) +class OralArgumentQuestionPurposeLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OralArgumentQuestionPurposeLegalBenchClassification.v2", + description="""This task classifies questions asked by Supreme Court justices at oral argument into seven categories: + 1. Background - questions seeking factual or procedural information that is missing or not clear in the briefing + 2. Clarification - questions seeking to get an advocate to clarify her position or the scope of the rule being advocated for + 3. Implications - questions about the limits of a rule or its implications for future cases + 4. Support - questions offering support for the advocate’s position + 5. Criticism - questions criticizing an advocate’s position + 6. Communicate - question designed primarily to communicate with other justices + 7. Humor - questions designed to interject humor into the argument and relieve tension + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/oral_argument_question_purpose_legal_bench", + "name": "oral_argument_question_purpose", + "revision": "cdc020e244cb846ce4e0325cb602cf04126c79d2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2021-01-01", "2023-08-23"), # best guess + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", + adapted_from=["OralArgumentQuestionPurposeLegalBenchClassification"], + ) + + class OverrulingLegalBenchClassification(AbsTaskClassification): + superseded_by = "OverrulingLegalBenchClassification.v2" metadata = TaskMetadata( name="OverrulingLegalBenchClassification", description="""This task consists of classifying whether or not a particular sentence of case law overturns the decision of a previous case.""", @@ -5323,6 +5678,57 @@ def dataset_transform(self): ) +class OverrulingLegalBenchClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OverrulingLegalBenchClassification.v2", + description="""This task consists of classifying whether or not a particular sentence of case law overturns the decision of a previous case. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/nguha/legalbench", + dataset={ + "path": "mteb/overruling_legal_bench", + "name": "overruling", + "revision": "fee708d1959b3258bc3e408afdd3e6c2051adf80", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("1965-01-01", "2023-08-23"), + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{zheng2021does, + author = {Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E}, + booktitle = {Proceedings of the eighteenth international conference on artificial intelligence and law}, + pages = {159--168}, + title = {When does pretraining help? assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings}, + year = {2021}, +} +""", + adapted_from=["OverrulingLegalBenchClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) + + class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification): metadata = TaskMetadata( name="PersonalJurisdictionLegalBenchClassification", diff --git a/mteb/tasks/Classification/eng/NewsClassification.py b/mteb/tasks/Classification/eng/NewsClassification.py index aec198d5c1..3fba46de16 100644 --- a/mteb/tasks/Classification/eng/NewsClassification.py +++ b/mteb/tasks/Classification/eng/NewsClassification.py @@ -5,6 +5,7 @@ class NewsClassification(AbsTaskClassification): + superseded_by = "NewsClassification.v2" metadata = TaskMetadata( name="NewsClassification", description="Large News Classification Dataset", @@ -43,3 +44,46 @@ class NewsClassification(AbsTaskClassification): } """, ) + + +class NewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="NewsClassification.v2", + description="""Large News Classification Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/news", + "revision": "7c1f485c1f43d6aef852c5df6db23b047991a8e7", + }, + reference="https://arxiv.org/abs/1509.01626", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2004-01-01", + "2015-12-31", + ), # Estimated range for the collection of news articles + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="apache-2.0", + annotations_creators="expert-annotated", + dialect=["eng-Latn-US", "en-Latn-GB", "en-Latn-AU"], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", + adapted_from=["NewsClassification"], + ) diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py index 270bde1698..0598118b0f 100644 --- a/mteb/tasks/Classification/eng/PatentClassification.py +++ b/mteb/tasks/Classification/eng/PatentClassification.py @@ -5,6 +5,7 @@ class PatentClassification(AbsTaskClassification): + superseded_by = "PatentClassification.v2" metadata = TaskMetadata( name="PatentClassification", description="Classification Dataset of Patents and Abstract", @@ -47,3 +48,49 @@ class PatentClassification(AbsTaskClassification): } """, ) + + +class PatentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PatentClassification.v2", + description="""Classification Dataset of Patents and Abstract + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/patent", + "revision": "f5e5c81286448c68264300fe1e6f3de599922890", + }, + reference="https://aclanthology.org/P19-1212.pdf", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2021-11-05", "2022-10-22"), + domains=["Legal", "Written"], + task_subtypes=["Topic classification"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{sharma-etal-2019-bigpatent, + abstract = {Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.}, + address = {Florence, Italy}, + author = {Sharma, Eva and +Li, Chen and +Wang, Lu}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/P19-1212}, + editor = {Korhonen, Anna and +Traum, David and +M{\`a}rquez, Llu{\'\i}s}, + month = jul, + pages = {2204--2213}, + publisher = {Association for Computational Linguistics}, + title = {{BIGPATENT}: A Large-Scale Dataset for Abstractive and Coherent Summarization}, + url = {https://aclanthology.org/P19-1212}, + year = {2019}, +} +""", + ) diff --git a/mteb/tasks/Classification/eng/PoemSentimentClassification.py b/mteb/tasks/Classification/eng/PoemSentimentClassification.py index 8671929fea..c330171c58 100644 --- a/mteb/tasks/Classification/eng/PoemSentimentClassification.py +++ b/mteb/tasks/Classification/eng/PoemSentimentClassification.py @@ -5,6 +5,7 @@ class PoemSentimentClassification(AbsTaskClassification): + superseded_by = "PoemSentimentClassification.v2" metadata = TaskMetadata( name="PoemSentimentClassification", description="Poem Sentiment is a sentiment dataset of poem verses from Project Gutenberg.", @@ -41,3 +42,40 @@ class PoemSentimentClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_column("verse_text", "text") + + +class PoemSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PoemSentimentClassification.v2", + description="""Poem Sentiment is a sentiment dataset of poem verses from Project Gutenberg. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2011.02686", + dataset={ + "path": "mteb/poem_sentiment", + "revision": "9fdc57b89ccc09a8d9256f376112d626878e51a7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("1700-01-01", "1900-01-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=["eng-Latn-US", "en-Latn-GB"], + sample_creation="found", + bibtex_citation=r""" +@misc{sheng2020investigating, + archiveprefix = {arXiv}, + author = {Emily Sheng and David Uthus}, + eprint = {2011.02686}, + primaryclass = {cs.CL}, + title = {Investigating Societal Biases in a Poetry Composition System}, + year = {2020}, +} +""", + adapted_from=["PoemSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index 5ae0df8602..5afbc2e13f 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -5,6 +5,7 @@ class SDSEyeProtectionClassification(AbsTaskClassification): + superseded_by = "SDSEyeProtectionClassification.v2" metadata = TaskMetadata( name="SDSEyeProtectionClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -43,3 +44,46 @@ class SDSEyeProtectionClassification(AbsTaskClassification): } """, ) + + +class SDSEyeProtectionClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSEyeProtectionClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/sds_eye_protection", + "revision": "ead011d2286d5395fea054d2282ca0478ceb7cfb", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@inproceedings{pereira2020msds, + author = {Pereira, Eliseu}, + booktitle = {15th Doctoral Symposium}, + pages = {42}, + title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + year = {2020}, +} +""", + adapted_from=["SDSEyeProtectionClassification"], + ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index 41b68096db..cc5c173bd0 100644 --- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -5,6 +5,7 @@ class SDSGlovesClassification(AbsTaskClassification): + superseded_by = "SDSGlovesClassification.v2" metadata = TaskMetadata( name="SDSGlovesClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -43,3 +44,46 @@ class SDSGlovesClassification(AbsTaskClassification): } """, ) + + +class SDSGlovesClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSGlovesClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/sds_gloves", + "revision": "09b09ee755ada02c68ad835971e22b1959d79448", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@inproceedings{pereira2020msds, + author = {Pereira, Eliseu}, + booktitle = {15th Doctoral Symposium}, + pages = {42}, + title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + year = {2020}, +} +""", + adapted_from=["SDSGlovesClassification"], + ) diff --git a/mteb/tasks/Classification/eng/ToxicChatClassification.py b/mteb/tasks/Classification/eng/ToxicChatClassification.py index e189cd51a0..b591659ba7 100644 --- a/mteb/tasks/Classification/eng/ToxicChatClassification.py +++ b/mteb/tasks/Classification/eng/ToxicChatClassification.py @@ -7,6 +7,7 @@ class ToxicChatClassification(AbsTaskClassification): + superseded_by = "ToxicChatClassification.v2" metadata = TaskMetadata( name="ToxicChatClassification", description="""This dataset contains toxicity annotations on 10K user @@ -64,3 +65,54 @@ def dataset_transform(self): # only use human-annotated data self.dataset = self.dataset.filter(lambda x: x["human_annotation"]) self.dataset = self.dataset.remove_columns(remove_cols) + + +class ToxicChatClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="ToxicChatClassification.v2", + description="""This dataset contains toxicity annotations on 10K user + prompts collected from the Vicuna online demo. We utilize a human-AI + collaborative annotation framework to guarantee the quality of annotation + while maintaining a feasible annotation workload. The details of data + collection, pre-processing, and annotation can be found in our paper. + We believe that ToxicChat can be a valuable resource to drive further + advancements toward building a safe and healthy environment for user-AI + interactions. + Only human annotated samples are selected here. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2023.findings-emnlp.311/", + dataset={ + "path": "mteb/toxic_chat", + "name": "toxicchat0124", + "revision": "800fec53e44419d13668be291aca50a071ab5849", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=_EVAL_SPLITS, + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2023-10-26", "2024-01-31"), + domains=["Constructed", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{lin2023toxicchat, + archiveprefix = {arXiv}, + author = {Zi Lin and Zihan Wang and Yongqi Tong and Yangkun Wang and Yuxin Guo and Yujia Wang and Jingbo Shang}, + eprint = {2310.17389}, + primaryclass = {cs.CL}, + title = {ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation}, + year = {2023}, +} +""", + adapted_from=["ToxicChatClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py index 439b19ba7c..c847c40ceb 100644 --- a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py +++ b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py @@ -5,6 +5,7 @@ class ToxicConversationsClassification(AbsTaskClassification): + superseded_by = "ToxicConversationsClassification.v2" metadata = TaskMetadata( name="ToxicConversationsClassification", description="Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.", @@ -47,3 +48,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class ToxicConversationsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="ToxicConversationsClassification.v2", + description="""Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview", + dataset={ + "path": "mteb/toxic_conversations", + "revision": "7ae55309fbe51a11e13c24887ceed200153514e9", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2017-01-01", + "2018-12-31", + ), # Estimated range for the collection of comments + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{jigsaw-unintended-bias-in-toxicity-classification, + author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum}, + publisher = {Kaggle}, + title = {Jigsaw Unintended Bias in Toxicity Classification}, + url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification}, + year = {2019}, +} +""", + prompt="Classify the given comments as either toxic or not toxic", + adapted_from=["ToxicConversationsClassification"], + ) + + samples_per_label = 16 + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py index a4ab4b5c70..bddfff5221 100644 --- a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py +++ b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py @@ -5,6 +5,7 @@ class TweetSentimentExtractionClassification(AbsTaskClassification): + superseded_by = "TweetSentimentExtractionClassification.v2" metadata = TaskMetadata( name="TweetSentimentExtractionClassification", description="", @@ -42,3 +43,45 @@ class TweetSentimentExtractionClassification(AbsTaskClassification): ) samples_per_label = 32 + + +class TweetSentimentExtractionClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TweetSentimentExtractionClassification.v2", + description=""" + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview", + dataset={ + "path": "mteb/tweet_sentiment_extraction", + "revision": "7261898ee3b9a739595e8dbf41df6b2332f429bb", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2020-01-01", + "2020-12-31", + ), # Estimated range for the collection of tweets + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{tweet-sentiment-extraction, + author = {Maggie, Phil Culliton, Wei Chen}, + publisher = {Kaggle}, + title = {Tweet Sentiment Extraction}, + url = {https://kaggle.com/competitions/tweet-sentiment-extraction}, + year = {2020}, +} +""", + prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral", + adapted_from=["TweetSentimentExtractionClassification"], + ) + + samples_per_label = 32 diff --git a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py index cad250dac0..6e3edb64ea 100644 --- a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py +++ b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py @@ -5,6 +5,7 @@ class TweetTopicSingleClassification(AbsTaskClassification): + superseded_by = "TweetTopicSingleClassification.v2" metadata = TaskMetadata( name="TweetTopicSingleClassification", description="""Topic classification dataset on Twitter with 6 labels. Each instance of @@ -52,3 +53,51 @@ class TweetTopicSingleClassification(AbsTaskClassification): def dataset_transform(self): self.dataset["train"] = self.dataset["train_2021"] + + +class TweetTopicSingleClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TweetTopicSingleClassification.v2", + description="""Topic classification dataset on Twitter with 6 labels. Each instance of + TweetTopic comes with a timestamp which distributes from September 2019 to August 2021. + Tweets were preprocessed before the annotation to normalize some artifacts, converting + URLs into a special token {{URL}} and non-verified usernames into {{USERNAME}}. For verified + usernames, we replace its display name (or account name) with symbols {@}. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/tweet_topic_single", + "revision": "a7904e26081f987da81ad2cc063e09e714e875d0", + }, + reference="https://arxiv.org/abs/2209.09824", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test_2021"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2019-09-01", "2021-08-31"), + domains=["Social", "News", "Written"], + task_subtypes=["Topic classification"], + license="not specified", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{dimosthenis-etal-2022-twitter, + address = {Gyeongju, Republic of Korea}, + author = {Antypas, Dimosthenis and +Ushio, Asahi and +Camacho-Collados, Jose and +Neves, Leonardo and +Silva, Vitor and +Barbieri, Francesco}, + booktitle = {Proceedings of the 29th International Conference on Computational Linguistics}, + month = oct, + publisher = {International Committee on Computational Linguistics}, + title = {{T}witter {T}opic {C}lassification}, + year = {2022}, +} +""", + adapted_from=["TweetTopicSingleClassification"], + ) diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py index c6ade13b66..746f68f4aa 100644 --- a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py @@ -5,6 +5,7 @@ class WikipediaBioMetChemClassification(AbsTaskClassification): + superseded_by = "WikipediaBioMetChemClassification.v2" metadata = TaskMetadata( name="WikipediaBioMetChemClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -35,3 +36,38 @@ class WikipediaBioMetChemClassification(AbsTaskClassification): } """, ) + + +class WikipediaBioMetChemClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaBioMetChemClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/wikipedia_bio_met_chem", + "revision": "5341bdf799e94949b5c2684bb54bfe462c528179", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", + adapted_from=["WikipediaBioMetChemClassification"], + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py index 940bcdc44e..133ea90055 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -5,6 +5,7 @@ class WikipediaChemFieldsClassification(AbsTaskClassification): + superseded_by = "WikipediaChemFieldsClassification.v2" metadata = TaskMetadata( name="WikipediaChemFieldsClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -35,3 +36,38 @@ class WikipediaChemFieldsClassification(AbsTaskClassification): } """, ) + + +class WikipediaChemFieldsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemFieldsClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/wikipedia_chem_fields", + "revision": "70ff36ec5157fcc54c59f3b85d2b1fb232f8feec", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", + adapted_from=["WikipediaChemFieldsClassification"], + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py index 8ee7c5b145..4f145eaa3c 100644 --- a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py @@ -5,6 +5,7 @@ class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): + superseded_by = "WikipediaCompChemSpectroscopyClassification.v2" metadata = TaskMetadata( name="WikipediaCompChemSpectroscopyClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -35,3 +36,38 @@ class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): } """, ) + + +class WikipediaCompChemSpectroscopyClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCompChemSpectroscopyClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/wikipedia_comp_chem_spectroscopy", + "revision": "a1ef45291dc5304482c42b9c053a5f7801e1006b", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", + adapted_from=["WikipediaCompChemSpectroscopyClassification"], + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py index 9bc991261a..52c6136167 100644 --- a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py @@ -5,6 +5,7 @@ class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): + superseded_by = "WikipediaCrystallographyAnalyticalClassification.v2" metadata = TaskMetadata( name="WikipediaCrystallographyAnalyticalClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -35,3 +36,38 @@ class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): } """, ) + + +class WikipediaCrystallographyAnalyticalClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCrystallographyAnalyticalClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/wikipedia_crystallography_analytical", + "revision": "b98f3205a68a9a50ab345abc85f01911089a93de", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", + adapted_from=["WikipediaCrystallographyAnalyticalClassification"], + ) diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py index f1d97d3a70..9c476ce76f 100644 --- a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -5,6 +5,7 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): + superseded_by = "WikipediaTheoreticalAppliedClassification.v2" metadata = TaskMetadata( name="WikipediaTheoreticalAppliedClassification", description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", @@ -35,3 +36,38 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): } """, ) + + +class WikipediaTheoreticalAppliedClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaTheoreticalAppliedClassification.v2", + description="""ChemTEB evaluates the performance of text embedding models on chemical domain data. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "mteb/wikipedia_theoretical_applied", + "revision": "9b984e9591b6c2d9291370b1bb233c2465d5bd2f", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", + adapted_from=["WikipediaTheoreticalAppliedClassification"], + ) diff --git a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py index 7a699bfaaf..d00b732bca 100644 --- a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py +++ b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py @@ -5,6 +5,7 @@ class YahooAnswersTopicsClassification(AbsTaskClassification): + superseded_by = "YahooAnswersTopicsClassification.v2" metadata = TaskMetadata( name="YahooAnswersTopicsClassification", description="Dataset composed of questions and answers from Yahoo Answers, categorized into topics.", @@ -56,3 +57,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train", "test"] ) + + +class YahooAnswersTopicsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="YahooAnswersTopicsClassification.v2", + description="""Dataset composed of questions and answers from Yahoo Answers, categorized into topics. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/yahoo_answers_topics", + dataset={ + "path": "mteb/yahoo_answers_topics", + "revision": "c4d89f9633025d50954ab98a4c2c2feb188f6279", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2022-01-25", "2022-01-25"), + domains=["Web", "Written"], + task_subtypes=["Topic classification"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", + adapted_from=["YahooAnswersTopicsClassification"], + ) + + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train", "test"] + ) diff --git a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py index 2c088af31a..fb1ff45f89 100644 --- a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py +++ b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py @@ -5,6 +5,7 @@ class YelpReviewFullClassification(AbsTaskClassification): + superseded_by = "YelpReviewFullClassification.v2" metadata = TaskMetadata( name="YelpReviewFullClassification", description="Yelp Review Full is a dataset for sentiment analysis, containing 5 classes corresponding to ratings 1-5.", @@ -47,3 +48,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class YelpReviewFullClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="YelpReviewFullClassification.v2", + description="""Yelp Review Full is a dataset for sentiment analysis, containing 5 classes corresponding to ratings 1-5. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/1509.01626", + dataset={ + "path": "mteb/yelp_review_full", + "revision": "49d71141934ae2e58733acd90908140e8ecaaee0", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2015-01-01", "2015-12-31"), # reviews from 2015 + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="https://huggingface.co/datasets/Yelp/yelp_review_full#licensing-information", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", + adapted_from=["YelpReviewFullClassification"], + ) + + samples_per_label = 128 + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/est/estonian_valence.py b/mteb/tasks/Classification/est/estonian_valence.py index 11561aa385..ade49a0789 100644 --- a/mteb/tasks/Classification/est/estonian_valence.py +++ b/mteb/tasks/Classification/est/estonian_valence.py @@ -5,6 +5,7 @@ class EstonianValenceClassification(AbsTaskClassification): + superseded_by = "EstonianValenceClassification.v2" metadata = TaskMetadata( name="EstonianValenceClassification", dataset={ @@ -53,3 +54,45 @@ def dataset_transform(self): ) samples_per_label = 16 + + +class EstonianValenceClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="EstonianValenceClassification.v2", + dataset={ + "path": "mteb/estonian_valence", + "revision": "8795961e2af5b83bcb8a6928636845ac2b92f92e", + }, + description="""Dataset containing annotated Estonian news data from the Postimees and Õhtuleht newspapers. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["est-Latn"], + main_score="accuracy", + date=( + "1857-01-01", # Inception of Postimees + "2023-11-08", # dataset publication + ), + domains=["News", "Written"], + task_subtypes=["Sentiment/Hate speech"], + dialect=[], + license="cc-by-4.0", + annotations_creators="human-annotated", + sample_creation="found", + bibtex_citation=r""" +@article{Pajupuu2023, + author = {Hille Pajupuu and Jaan Pajupuu and Rene Altrov and Kairi Tamuri}, + doi = {10.6084/m9.figshare.24517054.v1}, + month = {11}, + title = {{Estonian Valence Corpus / Eesti valentsikorpus}}, + url = {https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054}, + year = {2023}, +} +""", + adapted_from=["EstonianValenceClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py index ad94db9185..3eefd40cfb 100644 --- a/mteb/tasks/Classification/fas/FaMTEBClassification.py +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -437,6 +437,7 @@ class SynPerChatbotToneUserClassification(AbsTaskClassification): class SynPerTextToneClassification(AbsTaskClassification): + superseded_by = "SynPerTextToneClassification.v2" metadata = TaskMetadata( name="SynPerTextToneClassification", description="Persian Text Tone", @@ -463,7 +464,37 @@ class SynPerTextToneClassification(AbsTaskClassification): samples_per_label = 32 +class SynPerTextToneClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerTextToneClassification.v2", + description="""Persian Text Tone + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://mcinext.com/", + dataset={ + "path": "mteb/syn_per_text_tone", + "revision": "0ed7459db7e905714dc02cbe25b4eac55e91021e", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + adapted_from=["SynPerTextToneClassification"], + ) + samples_per_label = 32 + + class SIDClassification(AbsTaskClassification): + superseded_by = "SIDClassification.v2" metadata = TaskMetadata( name="SIDClassification", description="SID Classification", @@ -490,7 +521,37 @@ class SIDClassification(AbsTaskClassification): samples_per_label = 32 +class SIDClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SIDClassification.v2", + description="""SID Classification + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://mcinext.com/", + dataset={ + "path": "mteb/sid", + "revision": "8234b2081bd9ca33bdbc7bf68f5f9540fe3fd480", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + adapted_from=["SIDClassification"], + ) + samples_per_label = 32 + + class DeepSentiPers(AbsTaskClassification): + superseded_by = "DeepSentiPers.v2" metadata = TaskMetadata( name="DeepSentiPers", description="Persian Sentiment Analysis Dataset", @@ -520,7 +581,37 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("review", "text") +class DeepSentiPersV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DeepSentiPers.v2", + description="""Persian Sentiment Analysis Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/JoyeBright/DeepSentiPers", + dataset={ + "path": "mteb/deep_senti_pers", + "revision": "8d60d8315ac650ef0af32d68c4f92916ffc5cfb8", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + adapted_from=["DeepSentiPers"], + ) + samples_per_label = 32 + + class PersianTextEmotion(AbsTaskClassification): + superseded_by = "PersianTextEmotion.v2" metadata = TaskMetadata( name="PersianTextEmotion", description="Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise.", @@ -547,7 +638,37 @@ class PersianTextEmotion(AbsTaskClassification): samples_per_label = 32 +class PersianTextEmotionV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PersianTextEmotion.v2", + description="""Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion", + dataset={ + "path": "mteb/persian_text_emotion", + "revision": "a45594021eca1d1577296edc030d972a92ff26b3", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + adapted_from=["PersianTextEmotion"], + ) + samples_per_label = 32 + + class SentimentDKSF(AbsTaskClassification): + superseded_by = "SentimentDKSF.v2" metadata = TaskMetadata( name="SentimentDKSF", description="The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis.", @@ -574,7 +695,37 @@ class SentimentDKSF(AbsTaskClassification): samples_per_label = 32 +class SentimentDKSFV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SentimentDKSF.v2", + description="""The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/hezarai/hezar", + dataset={ + "path": "mteb/sentiment_dksf", + "revision": "05129fb229c8f68267d112cffa655f1312ec6575", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + adapted_from=["SentimentDKSF"], + ) + samples_per_label = 32 + + class NLPTwitterAnalysisClassification(AbsTaskClassification): + superseded_by = "NLPTwitterAnalysisClassification.v2" metadata = TaskMetadata( name="NLPTwitterAnalysisClassification", description="Twitter Analysis Classification", @@ -604,6 +755,35 @@ def dataset_transform(self): self.dataset = self.dataset.rename_column("tweet", "text") +class NLPTwitterAnalysisClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="NLPTwitterAnalysisClassification.v2", + description="""Twitter Analysis Classification + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main", + dataset={ + "path": "mteb/nlp_twitter_analysis", + "revision": "41d85185019495609522fece20e93d11ab705301", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Social"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + adapted_from=["NLPTwitterAnalysisClassification"], + ) + samples_per_label = 32 + + class DigikalamagClassification(AbsTaskClassification): metadata = TaskMetadata( name="DigikalamagClassification", diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index 5f4239d2fa..ce31bd0ad8 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -7,6 +7,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): + superseded_by = "FilipinoHateSpeechClassification.v2" metadata = TaskMetadata( name="FilipinoHateSpeechClassification", description="Filipino Twitter dataset for sentiment classification.", @@ -40,3 +41,41 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): } """, ) + + +class FilipinoHateSpeechClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FilipinoHateSpeechClassification.v2", + description="""Filipino Twitter dataset for sentiment classification. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019", + dataset={ + "path": "mteb/filipino_hate_speech", + "revision": "fdb7536c08d2f10d8b8b618c3ff00b0be286d844", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2019-08-01", "2019-08-01"), + eval_splits=["validation", "test"], + eval_langs=["fil-Latn"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{Cabasag-2019-hate-speech, + author = {Neil Vicente Cabasag, Vicente Raphael Chan, Sean Christian Lim, Mark Edward Gonzales, and Charibeth Cheng}, + journal = {Philippine Computing Journal}, + month = {August}, + number = {1}, + title = {Hate speech in Philippine election-related tweets: Automatic detection and classification using natural language processing.}, + volume = {XIV}, + year = {2019}, +} +""", + adapted_from=["FilipinoHateSpeechClassification"], + ) diff --git a/mteb/tasks/Classification/fin/FinToxicityClassification.py b/mteb/tasks/Classification/fin/FinToxicityClassification.py index d847dac5a8..13e1206dce 100644 --- a/mteb/tasks/Classification/fin/FinToxicityClassification.py +++ b/mteb/tasks/Classification/fin/FinToxicityClassification.py @@ -5,6 +5,7 @@ class FinToxicityClassification(AbsTaskClassification): + superseded_by = "FinToxicityClassification.v2" metadata = TaskMetadata( name="FinToxicityClassification", description=""" @@ -53,6 +54,47 @@ def dataset_transform(self): if col not in ["text", "label"] ] self.dataset = self.dataset.remove_columns(remove_cols) - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train", "test"] - ) + + +class FinToxicityClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FinToxicityClassification.v2", + description=""" + This dataset is a DeepL -based machine translated version of the Jigsaw toxicity dataset for Finnish. The dataset is originally from a Kaggle competition https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data. + The original dataset poses a multi-label text classification problem and includes the labels identity_attack, insult, obscene, severe_toxicity, threat and toxicity. + Here adapted for toxicity classification, which is the most represented class. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/fin_toxicity", + "revision": "1deba6e874be1d5632a4ac0d1fb71f4bc3dea0d6", + }, + reference="https://aclanthology.org/2023.nodalida-1.68", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fin-Latn"], + main_score="f1", + date=("2023-03-13", "2023-09-25"), + domains=["News", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated", + bibtex_citation=r""" +@inproceedings{eskelinen-etal-2023-toxicity, + author = {Eskelinen, Anni and +Silvala, Laura and +Ginter, Filip and +Pyysalo, Sampo and +Laippala, Veronika}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + month = may, + title = {Toxicity Detection in {F}innish Using Machine Translation}, + year = {2023}, +} +""", + adapted_from=["FinToxicityClassification"], + ) diff --git a/mteb/tasks/Classification/fra/FrenchBookReviews.py b/mteb/tasks/Classification/fra/FrenchBookReviews.py index cb9c7b37c9..0af4947a35 100644 --- a/mteb/tasks/Classification/fra/FrenchBookReviews.py +++ b/mteb/tasks/Classification/fra/FrenchBookReviews.py @@ -5,6 +5,7 @@ class FrenchBookReviews(AbsTaskClassification): + superseded_by = "FrenchBookReviews.v2" metadata = TaskMetadata( name="FrenchBookReviews", dataset={ @@ -35,3 +36,37 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class FrenchBookReviewsV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FrenchBookReviews.v2", + dataset={ + "path": "mteb/french_book_reviews", + "revision": "71d755fd76073533c3d0c262f6b542eb0fa7ce96", + }, + description="""It is a French book reviews dataset containing a huge number of reader reviews on French books. Each review is pared with a rating that ranges from 0.5 to 5 (with 0.5 increment). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/Abirate/french_book_reviews", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fra-Latn"], + main_score="accuracy", + date=("2022-01-01", "2023-01-01"), + domains=["Reviews", "Written"], + task_subtypes=[], + license="cc0-1.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + """, + adapted_from=["FrenchBookReviews"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py index b488661093..04ff73b78d 100644 --- a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py @@ -5,6 +5,7 @@ class MovieReviewSentimentClassification(AbsTaskClassification): + superseded_by = "MovieReviewSentimentClassification.v2" metadata = TaskMetadata( name="MovieReviewSentimentClassification", dataset={ @@ -41,3 +42,43 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["validation", "test"] ) + + +class MovieReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MovieReviewSentimentClassification.v2", + dataset={ + "path": "mteb/movie_review_sentiment", + "revision": "4e182033cbfe75ae0556cd640d028986be82afd8", + }, + description="""The Allociné dataset is a French-language dataset for sentiment analysis that contains movie reviews produced by the online community of the Allociné.fr website. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/TheophileBlard/french-sentiment-analysis-with-bert", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=["fra-Latn"], + main_score="accuracy", + date=("2006-01-01", "2020-01-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@software{blard2020, + author = {Théophile Blard}, + title = {French sentiment analysis with BERT}, + url = {https://github.com/TheophileBlard/french-sentiment-analysis-with-bert}, + year = {2020}, +} +""", + adapted_from=["MovieReviewSentimentClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["validation", "test"] + ) diff --git a/mteb/tasks/Classification/guj/GujaratiNewsClassification.py b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py index 0c93c0dd21..e0714a2b00 100644 --- a/mteb/tasks/Classification/guj/GujaratiNewsClassification.py +++ b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py @@ -5,6 +5,7 @@ class GujaratiNewsClassification(AbsTaskClassification): + superseded_by = "GujaratiNewsClassification.v2" metadata = TaskMetadata( name="GujaratiNewsClassification", description="A Gujarati dataset for 3-class classification of Gujarati news articles", @@ -31,3 +32,31 @@ class GujaratiNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_column("headline", "text") + + +class GujaratiNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="GujaratiNewsClassification.v2", + description="""A Gujarati dataset for 3-class classification of Gujarati news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/goru001/nlp-for-gujarati", + dataset={ + "path": "mteb/gujarati_news", + "revision": "a815d60b34dc8ee11910743d380b7d14a4e227cb", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["guj-Gujr"], + main_score="accuracy", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="", # none found, + adapted_from=["GujaratiNewsClassification"], + ) diff --git a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py index 5f70955710..3bbdb698b3 100644 --- a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py +++ b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py @@ -7,6 +7,7 @@ class HebrewSentimentAnalysis(AbsTaskClassification): + superseded_by = "HebrewSentimentAnalysis.v2" metadata = TaskMetadata( name="HebrewSentimentAnalysis", dataset={ @@ -49,3 +50,44 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class HebrewSentimentAnalysisV2(AbsTaskClassification): + metadata = TaskMetadata( + name="HebrewSentimentAnalysis.v2", + dataset={ + "path": "mteb/hebrew_sentiment_analysis", + "revision": "7ecd049fc8ac0d6f0a0121c8ff9fe44ea5bd935b", + "name": "morph", + }, + description="""HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/hebrew_sentiment", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["heb-Hebr"], + main_score="accuracy", + date=("2015-10-01", "2015-10-31"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{amram-etal-2018-representations, + address = {Santa Fe, New Mexico, USA}, + author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut}, + booktitle = {Proceedings of the 27th International Conference on Computational Linguistics}, + month = aug, + pages = {2242--2252}, + publisher = {Association for Computational Linguistics}, + title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew}, + url = {https://www.aclweb.org/anthology/C18-1190}, + year = {2018}, +} +""", + adapted_from=["HebrewSentimentAnalysis"], + ) diff --git a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py index 52fc83a720..1b72ce3133 100644 --- a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py +++ b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py @@ -5,6 +5,7 @@ class HindiDiscourseClassification(AbsTaskClassification): + superseded_by = "HindiDiscourseClassification.v2" metadata = TaskMetadata( name="HindiDiscourseClassification", dataset={ @@ -60,3 +61,59 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class HindiDiscourseClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="HindiDiscourseClassification.v2", + dataset={ + "path": "mteb/hindi_discourse", + "revision": "9d10173a3df9858adc90711d8da9abf3df0a1259", + }, + description="""A Hindi Discourse dataset in Hindi with values for coherence. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2020.lrec-1.149/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["hin-Deva"], + main_score="accuracy", + date=("2019-12-01", "2020-04-09"), + domains=["Fiction", "Social", "Written"], + dialect=[], + task_subtypes=["Discourse coherence"], + license="mit", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{dhanwal-etal-2020-annotated, + address = {Marseille, France}, + author = {Dhanwal, Swapnil and +Dutta, Hritwik and +Nankani, Hitesh and +Shrivastava, Nilay and +Kumar, Yaman and +Li, Junyi Jessy and +Mahata, Debanjan and +Gosangi, Rakesh and +Zhang, Haimin and +Shah, Rajiv Ratn and +Stent, Amanda}, + booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + publisher = {European Language Resources Association}, + title = {An Annotated Dataset of Discourse Modes in {H}indi Stories}, + url = {https://www.aclweb.org/anthology/2020.lrec-1.149}, + year = {2020}, +} +""", + adapted_from=["HindiDiscourseClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py index c922567b8f..27625ff4b9 100644 --- a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py +++ b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py @@ -5,6 +5,7 @@ class SentimentAnalysisHindi(AbsTaskClassification): + superseded_by = "SentimentAnalysisHindi.v2" metadata = TaskMetadata( name="SentimentAnalysisHindi", description="Hindi Sentiment Analysis Dataset", @@ -42,3 +43,40 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class SentimentAnalysisHindiV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SentimentAnalysisHindi.v2", + description="""Hindi Sentiment Analysis Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi", + dataset={ + "path": "mteb/sentiment_analysis_hindi", + "revision": "27fc099ce1c5a7295b9231e53a37648cdef6cb79", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["hin-Deva"], + main_score="f1", + date=("2023-09-15", "2023-10-16"), + dialect=[], + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="found", + bibtex_citation=r""" +@misc{OdiaGenAI, + author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli}, + howpublished = {{https://huggingface.co/OdiaGenAI}}, + journal = {Hugging Face repository}, + publisher = {Hugging Face}, + title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language}, + year = {2023}, +} +""", + adapted_from=["SentimentAnalysisHindi"], + ) diff --git a/mteb/tasks/Classification/hrv/FrenkHrClassification.py b/mteb/tasks/Classification/hrv/FrenkHrClassification.py index 440e0c90ad..700c18f1e3 100644 --- a/mteb/tasks/Classification/hrv/FrenkHrClassification.py +++ b/mteb/tasks/Classification/hrv/FrenkHrClassification.py @@ -5,6 +5,7 @@ class FrenkHrClassification(AbsTaskClassification): + superseded_by = "FrenkHrClassification.v2" metadata = TaskMetadata( name="FrenkHrClassification", description="Croatian subset of the FRENK dataset", @@ -39,3 +40,41 @@ class FrenkHrClassification(AbsTaskClassification): } """, ) + + +class FrenkHrClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FrenkHrClassification.v2", + description="""Croatian subset of the FRENK dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/frenk_hr", + "revision": "09f90d0bee34d5e703caed26737166591a8f12b9", + }, + reference="https://arxiv.org/abs/1906.02045", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["hrv-Latn"], + main_score="accuracy", + date=("2021-05-28", "2021-05-28"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", + adapted_from=["FrenkHrClassification"], + ) diff --git a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py index adcfbd57df..8977120251 100644 --- a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py +++ b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py @@ -5,6 +5,7 @@ class IndonesianIdClickbaitClassification(AbsTaskClassification): + superseded_by = "IndonesianIdClickbaitClassification.v2" metadata = TaskMetadata( name="IndonesianIdClickbaitClassification", dataset={ @@ -50,3 +51,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class IndonesianIdClickbaitClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="IndonesianIdClickbaitClassification.v2", + dataset={ + "path": "mteb/indonesian_id_clickbait", + "revision": "a54158a1b437a85e1982a70d0c57a69c69d0a5b8", + }, + description="""The CLICK-ID dataset is a collection of Indonesian news headlines that was collected from 12 local online news publishers. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="http://www.sciencedirect.com/science/article/pii/S2352340920311252", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ind-Latn"], + main_score="f1", + date=("2020-10-01", "2020-10-01"), + domains=["News", "Written"], + dialect=[], + task_subtypes=["Claim verification"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@article{WILLIAM2020106231, + abstract = {News analysis is a popular task in Natural Language Processing (NLP). In particular, the problem of clickbait in news analysis has gained attention in recent years [1, 2]. However, the majority of the tasks has been focused on English news, in which there is already a rich representative resource. For other languages, such as Indonesian, there is still a lack of resource for clickbait tasks. Therefore, we introduce the CLICK-ID dataset of Indonesian news headlines extracted from 12 Indonesian online news publishers. It is comprised of 15,000 annotated headlines with clickbait and non-clickbait labels. Using the CLICK-ID dataset, we then developed an Indonesian clickbait classification model achieving favourable performance. We believe that this corpus will be useful for replicable experiments in clickbait detection or other experiments in NLP areas.}, + author = {Andika William and Yunita Sari}, + doi = {https://doi.org/10.1016/j.dib.2020.106231}, + issn = {2352-3409}, + journal = {Data in Brief}, + keywords = {Indonesian, Natural Language Processing, News articles, Clickbait, Text-classification}, + pages = {106231}, + title = {CLICK-ID: A novel dataset for Indonesian clickbait headlines}, + url = {http://www.sciencedirect.com/science/article/pii/S2352340920311252}, + volume = {32}, + year = {2020}, +} +""", + adapted_from=["IndonesianIdClickbaitClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py index cef0f33fac..ac6e7002b8 100644 --- a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py +++ b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py @@ -10,6 +10,7 @@ class IndonesianMongabayConservationClassification(AbsTaskClassification): + superseded_by = "IndonesianMongabayConservationClassification.v2" metadata = TaskMetadata( name="IndonesianMongabayConservationClassification", description="Conservation dataset that was collected from mongabay.co.id contains topic-classification task (multi-label format) and sentiment classification. This task only covers sentiment analysis (positive, neutral negative)", @@ -100,3 +101,53 @@ def dataset_transform(self): ) self.dataset = datasets.DatasetDict(ds) + + +class IndonesianMongabayConservationClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="IndonesianMongabayConservationClassification.v2", + description="""Conservation dataset that was collected from mongabay.co.id contains topic-classification task (multi-label format) and sentiment classification. This task only covers sentiment analysis (positive, neutral negative) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2023.sealp-1.4/", + dataset={ + "path": "mteb/indonesian_mongabay_conservation", + "revision": "04863a3b6885470071f649a4d4dcd7e9d8e98cf8", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2012-01-01", "2023-12-31"), + eval_splits=["validation", "test"], + eval_langs=["ind-Latn"], + main_score="f1", + domains=["Web", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{fransiska-etal-2023-utilizing, + address = {Nusa Dua, Bali, Indonesia}, + author = {Fransiska, Mega and +Pitaloka, Diah and +Saripudin, Saripudin and +Putra, Satrio and +Sutawika*, Lintang}, + booktitle = {Proceedings of the First Workshop in South East Asian Language Processing}, + doi = {10.18653/v1/2023.sealp-1.4}, + editor = {Wijaya, Derry and +Aji, Alham Fikri and +Vania, Clara and +Winata, Genta Indra and +Purwarianti, Ayu}, + month = nov, + pages = {30--54}, + publisher = {Association for Computational Linguistics}, + title = {Utilizing Weak Supervision to Generate {I}ndonesian Conservation Datasets}, + url = {https://aclanthology.org/2023.sealp-1.4}, + year = {2023}, +} +""", + adapted_from=["IndonesianMongabayConservationClassification"], + ) diff --git a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py index db6f371494..66d34839f8 100644 --- a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py +++ b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py @@ -5,6 +5,7 @@ class ItalianLinguisticAcceptabilityClassification(AbsTaskClassification): + superseded_by = "Itacola.v2" metadata = TaskMetadata( name="Itacola", dataset={ @@ -52,3 +53,47 @@ def dataset_transform(self): .rename_columns({"sentence": "text"}) .remove_columns(["unique_id", "source"]) ) + + +class ItalianLinguisticAcceptabilityClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="Itacola.v2", + dataset={ + "path": "mteb/italian_linguistic_acceptability", + "revision": "4550151a0f0433e65df172c088427063e376ce81", + }, + description="""An Italian Corpus of Linguistic Acceptability taken from linguistic literature with a binary annotation made by the original authors themselves. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2021.findings-emnlp.250/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + date=("2021-01-01", "2021-12-31"), + domains=["Non-fiction", "Spoken", "Written"], + dialect=[], + task_subtypes=["Linguistic acceptability"], + license="not specified", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{trotta-etal-2021-monolingual-cross, + address = {Punta Cana, Dominican Republic}, + author = {Trotta, Daniela and +Guarasci, Raffaele and +Leonardelli, Elisa and +Tonelli, Sara}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021}, + doi = {10.18653/v1/2021.findings-emnlp.250}, + month = nov, + pages = {2929--2940}, + publisher = {Association for Computational Linguistics}, + title = {Monolingual and Cross-Lingual Acceptability Judgments with the {I}talian {C}o{LA} corpus}, + url = {https://aclanthology.org/2021.findings-emnlp.250}, + year = {2021}, +} +""", + adapted_from=["ItalianLinguisticAcceptabilityClassification"], + ) diff --git a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py index b0fa0144bd..2f9e613701 100644 --- a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py +++ b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py @@ -5,6 +5,7 @@ class JavaneseIMDBClassification(AbsTaskClassification): + superseded_by = "JavaneseIMDBClassification.v2" metadata = TaskMetadata( name="JavaneseIMDBClassification", description="Large Movie Review Dataset translated to Javanese. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets.", @@ -43,3 +44,45 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class JavaneseIMDBClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="JavaneseIMDBClassification.v2", + description="""Large Movie Review Dataset translated to Javanese. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/w11wo/nlp-datasets#javanese-imdb", + dataset={ + "path": "mteb/javanese_imdb", + "revision": "47aadc77049fa4e7b9001c69a255555814d026d9", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2021-06-24", "2021-06-24"), + eval_splits=["test"], + eval_langs=["jav-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{wongso2021causal, + author = {Wongso, Wilson and Setiawan, David Samuel and Suhartono, Derwin}, + booktitle = {2021 International Conference on Advanced Computer Science and Information Systems (ICACSIS)}, + organization = {IEEE}, + pages = {1--7}, + title = {Causal and Masked Language Modeling of Javanese Language using Transformer-based Architectures}, + year = {2021}, +} +""", + adapted_from=["JavaneseIMDBClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/jpn/JapaneseSentimentClassification.py b/mteb/tasks/Classification/jpn/JapaneseSentimentClassification.py new file mode 100644 index 0000000000..a7c000ffab --- /dev/null +++ b/mteb/tasks/Classification/jpn/JapaneseSentimentClassification.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class JapaneseSentimentClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="JapaneseSentimentClassification", + dataset={ + "path": "sbintuitions/JMTEB", + "name": "japanese_sentiment_classification", + "revision": "6fe2ff3fab4a9ee7172e4cd5791600d9e2e7fde5", + "trust_remote_code": True, + }, + description="""Japanese sentiment classification dataset with binary + (positive vs negative sentiment) labels. This version reverts + the morphological analysis from the original multilingual dataset + to restore natural Japanese text without artificial spaces. + """, + reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["jpn-Jpan"], + main_score="accuracy", + date=("2022-08-01", "2022-08-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + adapted_from=["MultilingualSentimentClassification"], + bibtex_citation=r""" +@inproceedings{mollanorozy-etal-2023-cross, + address = {Dubrovnik, Croatia}, + author = {Mollanorozy, Sepideh and +Tanti, Marc and +Nissim, Malvina}, + booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP}, + doi = {10.18653/v1/2023.sigtyp-1.9}, + editor = {Beinborn, Lisa and +Goswami, Koustava and +Murado{\\u{g}}lu, Saliha and +Sorokin, Alexey and +Shcherbakov, Andreas and +Ponti, Edoardo M. and +Cotterell, Ryan and +Vylomova, Ekaterina}, + month = may, + pages = {89--95}, + publisher = {Association for Computational Linguistics}, + title = {Cross-lingual Transfer Learning with \{P\}ersian}, + url = {https://aclanthology.org/2023.sigtyp-1.9}, + year = {2023}, +} +""", + ) diff --git a/mteb/tasks/Classification/jpn/WRIMEClassification.py b/mteb/tasks/Classification/jpn/WRIMEClassification.py index 893b092167..e4c5d8ec50 100644 --- a/mteb/tasks/Classification/jpn/WRIMEClassification.py +++ b/mteb/tasks/Classification/jpn/WRIMEClassification.py @@ -5,6 +5,7 @@ class WRIMEClassification(AbsTaskClassification): + superseded_by = "WRIMEClassification.v2" metadata = TaskMetadata( name="WRIMEClassification", dataset={ @@ -68,3 +69,64 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class WRIMEClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WRIMEClassification.v2", + dataset={ + "path": "mteb/wrime", + "revision": "6687c3bd031a0b144189958bad57db0b95a48dec", + "name": "ver2", + }, + description="""A dataset of Japanese social network rated for sentiment + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2021.naacl-main.169/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["jpn-Jpan"], + main_score="accuracy", + date=("2011-06-01", "2020-05-31"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="https://huggingface.co/datasets/shunk031/wrime#licensing-information", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{kajiwara-etal-2021-wrime, + abstract = {We annotate 17,000 SNS posts with both the writer{'}s subjective emotional intensity and the reader{'}s objective one to construct a Japanese emotion analysis dataset. In this study, we explore the difference between the emotional intensity of the writer and that of the readers with this dataset. We found that the reader cannot fully detect the emotions of the writer, especially anger and trust. In addition, experimental results in estimating the emotional intensity show that it is more difficult to estimate the writer{'}s subjective labels than the readers{'}. The large gap between the subjective and objective emotions imply the complexity of the mapping from a post to the subjective emotion intensities, which also leads to a lower performance with machine learning models.}, + address = {Online}, + author = {Kajiwara, Tomoyuki and +Chu, Chenhui and +Takemura, Noriko and +Nakashima, Yuta and +Nagahara, Hajime}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + doi = {10.18653/v1/2021.naacl-main.169}, + editor = {Toutanova, Kristina and +Rumshisky, Anna and +Zettlemoyer, Luke and +Hakkani-Tur, Dilek and +Beltagy, Iz and +Bethard, Steven and +Cotterell, Ryan and +Chakraborty, Tanmoy and +Zhou, Yichao}, + month = jun, + pages = {2095--2104}, + publisher = {Association for Computational Linguistics}, + title = {{WRIME}: A New Dataset for Emotional Intensity Estimation with Subjective and Objective Annotations}, + url = {https://aclanthology.org/2021.naacl-main.169}, + year = {2021}, +} +""", + adapted_from=["WRIMEClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/kan/KannadaNewsClassification.py b/mteb/tasks/Classification/kan/KannadaNewsClassification.py index 4d3edcc6ca..5350d63e4f 100644 --- a/mteb/tasks/Classification/kan/KannadaNewsClassification.py +++ b/mteb/tasks/Classification/kan/KannadaNewsClassification.py @@ -5,6 +5,7 @@ class KannadaNewsClassification(AbsTaskClassification): + superseded_by = "KannadaNewsClassification.v2" metadata = TaskMetadata( name="KannadaNewsClassification", description="The Kannada news dataset contains only the headlines of news article in three categories: Entertainment, Tech, and Sports. The data set contains around 6300 news article headlines which are collected from Kannada news websites. The data set has been cleaned and contains train and test set using which can be used to benchmark topic classification models in Kannada.", @@ -41,3 +42,43 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class KannadaNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="KannadaNewsClassification.v2", + description="""The Kannada news dataset contains only the headlines of news article in three categories: Entertainment, Tech, and Sports. The data set contains around 6300 news article headlines which are collected from Kannada news websites. The data set has been cleaned and contains train and test set using which can be used to benchmark topic classification models in Kannada. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/kannada_news", + "revision": "bf9c88b5bd4e5b349a39492e5298a928ab509a92", + }, + reference="https://github.com/goru001/nlp-for-kannada", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["kan-Knda"], + main_score="accuracy", + date=("2019-03-17", "2020-08-06"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", + adapted_from=["KannadaNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/kor/KlueTC.py b/mteb/tasks/Classification/kor/KlueTC.py index bf878570ac..8278f85684 100644 --- a/mteb/tasks/Classification/kor/KlueTC.py +++ b/mteb/tasks/Classification/kor/KlueTC.py @@ -5,6 +5,7 @@ class KlueTC(AbsTaskClassification): + superseded_by = "KLUE-TC.v2" metadata = TaskMetadata( name="KLUE-TC", dataset={ @@ -53,3 +54,46 @@ def id2str(example): {"title": "text", "label": "label_id"} ) self.dataset = self.dataset.map(id2str) + + +class KlueTCV2(AbsTaskClassification): + metadata = TaskMetadata( + name="KLUE-TC.v2", + dataset={ + "path": "mteb/klue_tc", + "name": "ynat", + "revision": "c0e3d82ac01def9bfd92dffb1e7dde619b50d0a2", + }, + description="""Topic classification dataset of human-annotated news headlines. Part of the Korean Language Understanding Evaluation (KLUE). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2105.09680", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation"], + eval_langs=["kor-Hang"], + main_score="accuracy", + date=("2016-01-01", "2020-12-31"), # from 2016 to 2020 + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", + adapted_from=["KlueTC"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["validation"] + ) diff --git a/mteb/tasks/Classification/kor/KorHateClassification.py b/mteb/tasks/Classification/kor/KorHateClassification.py index a9ec38fdef..d26a238fd3 100644 --- a/mteb/tasks/Classification/kor/KorHateClassification.py +++ b/mteb/tasks/Classification/kor/KorHateClassification.py @@ -5,6 +5,7 @@ class KorHateClassification(AbsTaskClassification): + superseded_by = "KorHateClassification.v2" metadata = TaskMetadata( name="KorHateClassification", description="""The dataset was created to provide the first human-labeled Korean corpus for @@ -57,3 +58,52 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class KorHateClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="KorHateClassification.v2", + description="""The dataset was created to provide the first human-labeled Korean corpus for + toxic speech detection from a Korean online entertainment news aggregator. Recently, + two young Korean celebrities suffered from a series of tragic incidents that led to two + major Korean web portals to close the comments section on their platform. However, this only + serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset + hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators, + consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/kor_hate", + "revision": "5d64e6dcbe9204c934e9a3852b1130a6f2d51ad4", + }, + reference="https://paperswithcode.com/dataset/korean-hatespeech-dataset", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="accuracy", + date=("2018-01-01", "2020-01-01"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{moon2020beep, + archiveprefix = {arXiv}, + author = {Jihyung Moon and Won Ik Cho and Junbum Lee}, + eprint = {2005.12503}, + primaryclass = {cs.CL}, + title = {BEEP! Korean Corpus of Online News Comments for Toxic Speech Detection}, + year = {2020}, +} +""", + adapted_from=["KorHateClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/kor/KorSarcasmClassification.py b/mteb/tasks/Classification/kor/KorSarcasmClassification.py index abae7c8222..0461db81c6 100644 --- a/mteb/tasks/Classification/kor/KorSarcasmClassification.py +++ b/mteb/tasks/Classification/kor/KorSarcasmClassification.py @@ -5,6 +5,7 @@ class KorSarcasmClassification(AbsTaskClassification): + superseded_by = "KorSarcasmClassification.v2" metadata = TaskMetadata( name="KorSarcasmClassification", description=""" @@ -51,3 +52,53 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class KorSarcasmClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="KorSarcasmClassification.v2", + description=""" + The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original + meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These + tweets were gathered by querying for: irony sarcastic, and + sarcasm. + The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm + and variants of it were used to return tweets. It was preprocessed by removing the keyword + hashtag, urls and mentions of the user to preserve anonymity. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/kor_sarcasm", + "revision": "0e5e17b4dba569776e445f5639ba13dc406b2b0e", + }, + reference="https://github.com/SpellOnYou/korean-sarcasm", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="accuracy", + date=("2018-10-31", "2019-09-28"), # estimated based on git history + domains=["Social", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{kim2019kocasm, + author = {Kim, Jiwon and Cho, Won Ik}, + howpublished = {https://github.com/SpellOnYou/korean-sarcasm}, + journal = {GitHub repository}, + publisher = {GitHub}, + title = {Kocasm: Korean Automatic Sarcasm Detection}, + year = {2019}, +} +""", + adapted_from=["KorSarcasmClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py index 876b7450fd..d9d4bac595 100644 --- a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py +++ b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py @@ -5,6 +5,7 @@ class KurdishSentimentClassification(AbsTaskClassification): + superseded_by = "KurdishSentimentClassification.v2" metadata = TaskMetadata( name="KurdishSentimentClassification", description="Kurdish Sentiment Dataset", @@ -38,3 +39,41 @@ class KurdishSentimentClassification(AbsTaskClassification): } """, ) + + +class KurdishSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="KurdishSentimentClassification.v2", + description="""Kurdish Sentiment Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://link.springer.com/article/10.1007/s10579-023-09716-6", + dataset={ + "path": "mteb/kurdish_sentiment", + "revision": "f6b00b2a1fcbffd83f10a76c85f246ca750c83d2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["kur-Arab"], + main_score="accuracy", + date=("2023-01-01", "2024-01-02"), + domains=["Web", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=["Sorani"], + sample_creation="found", + bibtex_citation=r""" +@article{article, + author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali}, + doi = {10.1007/s10579-023-09716-6}, + journal = {Language Resources and Evaluation}, + month = {01}, + pages = {1-20}, + title = {KurdiSent: a corpus for kurdish sentiment analysis}, + year = {2024}, +} +""", + adapted_from=["KurdishSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py index 689e7688ac..991311b5cb 100644 --- a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py +++ b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py @@ -5,6 +5,7 @@ class MalayalamNewsClassification(AbsTaskClassification): + superseded_by = "MalayalamNewsClassification.v2" metadata = TaskMetadata( name="MalayalamNewsClassification", description="A Malayalam dataset for 3-class classification of Malayalam news articles", @@ -38,3 +39,38 @@ class MalayalamNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"headings": "text"}) + + +class MalayalamNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MalayalamNewsClassification.v2", + description="""A Malayalam dataset for 3-class classification of Malayalam news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/goru001/nlp-for-malyalam", + dataset={ + "path": "mteb/malayalam_news", + "revision": "2bb37780ab4a68cb0b28a902059463563b2dbab9", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["mal-Mlym"], + main_score="accuracy", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", + adapted_from=["MalayalamNewsClassification"], + ) diff --git a/mteb/tasks/Classification/mar/MarathiNewsClassification.py b/mteb/tasks/Classification/mar/MarathiNewsClassification.py index 4f652e2373..152561ed4e 100644 --- a/mteb/tasks/Classification/mar/MarathiNewsClassification.py +++ b/mteb/tasks/Classification/mar/MarathiNewsClassification.py @@ -5,6 +5,7 @@ class MarathiNewsClassification(AbsTaskClassification): + superseded_by = "MarathiNewsClassification.v2" metadata = TaskMetadata( name="MarathiNewsClassification", description="A Marathi dataset for 3-class classification of Marathi news articles", @@ -39,3 +40,38 @@ class MarathiNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"headline": "text"}) self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) + + +class MarathiNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MarathiNewsClassification.v2", + description="""A Marathi dataset for 3-class classification of Marathi news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/goru001/nlp-for-marathi", + dataset={ + "path": "mteb/marathi_news", + "revision": "97932a2f3b75d7bd9fae0d212975c1a1568935eb", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["mar-Deva"], + main_score="f1", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", + adapted_from=["MarathiNewsClassification"], + ) diff --git a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py index 3eb6b2dc81..b2283b172c 100644 --- a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py +++ b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py @@ -5,6 +5,7 @@ class MacedonianTweetSentimentClassification(AbsTaskClassification): + superseded_by = "MacedonianTweetSentimentClassification.v2" metadata = TaskMetadata( name="MacedonianTweetSentimentClassification", description="An Macedonian dataset for tweet sentiment classification.", @@ -45,3 +46,48 @@ class MacedonianTweetSentimentClassification(AbsTaskClassification): } """, ) + + +class MacedonianTweetSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MacedonianTweetSentimentClassification.v2", + description="""An Macedonian dataset for tweet sentiment classification. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/R15-1034/", + dataset={ + "path": "mteb/macedonian_tweet_sentiment", + "revision": "3a8d98dc743809835255f727698d09814b699126", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=["2014-11-01", "2015-04-01"], + eval_splits=["test"], + eval_langs=["mkd-Cyrl"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{jovanoski-etal-2015-sentiment, + address = {Hissar, Bulgaria}, + author = {Jovanoski, Dame and +Pachovski, Veno and +Nakov, Preslav}, + booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing}, + editor = {Mitkov, Ruslan and +Angelova, Galia and +Bontcheva, Kalina}, + month = sep, + pages = {249--257}, + publisher = {INCOMA Ltd. Shoumen, BULGARIA}, + title = {Sentiment Analysis in {T}witter for {M}acedonian}, + url = {https://aclanthology.org/R15-1034}, + year = {2015}, +} +""", + adapted_from=["MacedonianTweetSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/multilingual/RuSciBenchClassification.py b/mteb/tasks/Classification/multilingual/RuSciBenchClassification.py new file mode 100644 index 0000000000..97d0f11b97 --- /dev/null +++ b/mteb/tasks/Classification/multilingual/RuSciBenchClassification.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuSciBenchCoreRiscClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchCoreRiscClassification", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + description="""This binary classification task aims to determine whether a scientific paper + (based on its title and abstract) belongs to the Core of the Russian Science Citation Index (RISC). + The RISC includes a wide range of publications, but the Core RISC comprises the most cited and prestigious + journals, dissertations, theses, monographs, and studies. The task is provided for both Russian and English + versions of the paper's title and abstract.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "corerisc_ru": ["rus-Cyrl"], + "corerisc_en": ["eng-Latn"], + }, + main_score="accuracy", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Classify whether a scientific article is part of the core RISC or not based on the title and abstract", + ) + + +class RuSciBenchPubTypeClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchPubTypeClassification", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + description="""This task involves classifying scientific papers (based on their title and abstract) + into different publication types. The dataset identifies the following types: + 'Article', 'Conference proceedings', 'Survey', 'Miscellanea', 'Short message', 'Review', and 'Personalia'. + This task is available for both Russian and English versions of the paper's title and abstract.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "pub_type_ru": ["rus-Cyrl"], + "pub_type_en": ["eng-Latn"], + }, + main_score="accuracy", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Classify the type of scientific paper based on the title and abstract", + ) + + +class RuSciBenchGRNTIClassificationV2(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchGRNTIClassification.v2", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + description="""Classification of scientific papers based on the GRNTI (State Rubricator of Scientific and + Technical Information) rubricator. GRNTI is a universal hierarchical classification of knowledge domains + adopted in Russia and CIS countries to systematize the entire flow of scientific and technical information. + This task uses the first level of the GRNTI hierarchy and top 28 classes by frequency. + + In this version, English language support has been added and data partitioning has been slightly modified. + """, + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "grnti_ru": ["rus-Cyrl"], + "grnti_en": ["eng-Latn"], + }, + main_score="accuracy", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Classify the category of scientific papers based on the titles and abstracts", + ) + + +class RuSciBenchOECDClassificationV2(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchOECDClassification.v2", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + description="""Classification of scientific papers based on the OECD + (Organization for Economic Co-operation and Development) rubricator. OECD provides + a hierarchical 3-level system of classes for labeling scientific articles. + This task uses the first two levels of the OECD hierarchy, top 29 classes. + + In this version, English language support has been added and data partitioning has been slightly modified. + """, + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "oecd_ru": ["rus-Cyrl"], + "oecd_en": ["eng-Latn"], + }, + main_score="accuracy", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Classify the category of scientific papers based on the titles and abstracts", + ) diff --git a/mteb/tasks/Classification/rus/ru_nlu_intent_classification.py b/mteb/tasks/Classification/multilingual/ru_nlu_intent_classification.py similarity index 100% rename from mteb/tasks/Classification/rus/ru_nlu_intent_classification.py rename to mteb/tasks/Classification/multilingual/ru_nlu_intent_classification.py diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py index f0aed9a7c2..6c67154477 100644 --- a/mteb/tasks/Classification/mya/MyanmarNews.py +++ b/mteb/tasks/Classification/mya/MyanmarNews.py @@ -5,6 +5,7 @@ class MyanmarNews(AbsTaskClassification): + superseded_by = "MyanmarNews.v2" metadata = TaskMetadata( name="MyanmarNews", dataset={ @@ -37,3 +38,40 @@ class MyanmarNews(AbsTaskClassification): } """, ) + + +class MyanmarNewsV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MyanmarNews.v2", + dataset={ + "path": "mteb/myanmar_news", + "revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5", + }, + description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/myanmar_news", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["mya-Mymr"], + main_score="accuracy", + date=("2017-10-01", "2017-10-31"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="gpl-3.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Khine2017, + author = {A. H. Khine and K. T. Nwet and K. M. Soe}, + booktitle = {15th Proceedings of International Conference on Computer Applications}, + month = {February}, + pages = {401--408}, + title = {Automatic Myanmar News Classification}, + year = {2017}, +} +""", + adapted_from=["MyanmarNews"], + ) diff --git a/mteb/tasks/Classification/nep/NepaliNewsClassification.py b/mteb/tasks/Classification/nep/NepaliNewsClassification.py index d266e38a6b..010d39b1c7 100644 --- a/mteb/tasks/Classification/nep/NepaliNewsClassification.py +++ b/mteb/tasks/Classification/nep/NepaliNewsClassification.py @@ -5,6 +5,7 @@ class NepaliNewsClassification(AbsTaskClassification): + superseded_by = "NepaliNewsClassification.v2" metadata = TaskMetadata( name="NepaliNewsClassification", description="A Nepali dataset for 7500 news articles ", @@ -54,3 +55,56 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class NepaliNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="NepaliNewsClassification.v2", + description="""A Nepali dataset for 7500 news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/goru001/nlp-for-nepali", + dataset={ + "path": "mteb/nepali_news", + "revision": "1e5e6cd30972f05f0f21af38bd3a887714d41938", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2019-01-01", "2020-01-01"), + eval_splits=["test"], + eval_langs=["nep-Deva"], + main_score="accuracy", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{arora-2020-inltk, + abstract = {We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.}, + address = {Online}, + author = {Arora, Gaurav}, + booktitle = {Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)}, + doi = {10.18653/v1/2020.nlposs-1.10}, + editor = {Park, Eunjeong L. and +Hagiwara, Masato and +Milajevs, Dmitrijs and +Liu, Nelson F. and +Chauhan, Geeticka and +Tan, Liling}, + month = nov, + pages = {66--71}, + publisher = {Association for Computational Linguistics}, + title = {i{NLTK}: Natural Language Toolkit for Indic Languages}, + url = {https://aclanthology.org/2020.nlposs-1.10}, + year = {2020}, +} +""", + adapted_from=["NepaliNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py index 08c233b1ec..ed9236be3a 100644 --- a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py +++ b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py @@ -5,6 +5,7 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification): + superseded_by = "DutchBookReviewSentimentClassification.v2" metadata = TaskMetadata( name="DutchBookReviewSentimentClassification", description="A Dutch book review for sentiment classification.", @@ -44,3 +45,47 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification): } """, ) + + +class DutchBookReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DutchBookReviewSentimentClassification.v2", + description="""A Dutch book review for sentiment classification. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/benjaminvdb/DBRD", + dataset={ + "path": "mteb/dutch_book_review_sentiment", + "revision": "73cffedb578b628588db03b8608880cf95688bb2", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2019-10-04", "2019-10-04"), + eval_splits=["test"], + eval_langs=["nld-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-1910-00896, + archiveprefix = {arXiv}, + author = {Benjamin, van der Burgh and +Suzan, Verberne}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/abs-1910-00896.bib}, + eprint = {1910.00896}, + journal = {CoRR}, + timestamp = {Fri, 04 Oct 2019 12:28:06 +0200}, + title = {The merits of Universal Language Model Fine-tuning for Small Datasets +- a case with Dutch book reviews}, + url = {http://arxiv.org/abs/1910.00896}, + volume = {abs/1910.00896}, + year = {2019}, +} +""", + adapted_from=["DutchBookReviewSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/nob/NoRecClassification.py b/mteb/tasks/Classification/nob/NoRecClassification.py index 3dfa084b0e..cdba137eb2 100644 --- a/mteb/tasks/Classification/nob/NoRecClassification.py +++ b/mteb/tasks/Classification/nob/NoRecClassification.py @@ -5,6 +5,7 @@ class NoRecClassification(AbsTaskClassification): + superseded_by = "NoRecClassification.v2" metadata = TaskMetadata( name="NoRecClassification", description="A Norwegian dataset for sentiment classification on review", @@ -60,3 +61,63 @@ class NoRecClassification(AbsTaskClassification): """, prompt="Classify Norwegian reviews by sentiment", ) + + +class NoRecClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="NoRecClassification.v2", + description="""A Norwegian dataset for sentiment classification on review + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/L18-1661/", + dataset={ + # using the mini version to keep results ~comparable to the ScandEval benchmark + "path": "mteb/no_rec", + "revision": "10aae1fb3fe2c19888bd4ea11695bbf19aa8bed3", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["nob-Latn"], + main_score="accuracy", + date=("1998-01-01", "2018-01-01"), # based on plot in paper + domains=["Written", "Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{velldal-etal-2018-norec, + address = {Miyazaki, Japan}, + author = {Velldal, Erik and +{\\O}vrelid, Lilja and +Bergem, Eivind Alexander and +Stadsnes, Cathrine and +Touileb, Samia and +J{\\o}rgensen, Fredrik}, + booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)}, + editor = {Calzolari, Nicoletta and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Hasida, Koiti and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\\'e}l{\\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios and +Tokunaga, Takenobu}, + month = may, + publisher = {European Language Resources Association (ELRA)}, + title = {{N}o{R}e{C}: The {N}orwegian Review Corpus}, + url = {https://aclanthology.org/L18-1661}, + year = {2018}, +} +""", + prompt="Classify Norwegian reviews by sentiment", + adapted_from=["NoRecClassification"], + ) diff --git a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py index b91c704063..03fbf0c381 100644 --- a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py +++ b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py @@ -5,6 +5,7 @@ class NorwegianParliamentClassification(AbsTaskClassification): + superseded_by = "NorwegianParliamentClassification.v2" metadata = TaskMetadata( name="NorwegianParliamentClassification", description="Norwegian parliament speeches annotated for sentiment", @@ -49,3 +50,51 @@ class NorwegianParliamentClassification(AbsTaskClassification): """, prompt="Classify parliament speeches in Norwegian based on political affiliation", ) + + +class NorwegianParliamentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="NorwegianParliamentClassification.v2", + description="""Norwegian parliament speeches annotated for sentiment + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament", + dataset={ + "path": "mteb/norwegian_parliament", + "revision": "7f2012a878e67486ac871cb450d6ef0dc2ebed7f", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test", "validation"], + eval_langs=["nob-Latn"], + # assumed to be bokmål + main_score="accuracy", + date=("1999-01-01", "2016-01-01"), # based on dates within the dataset + domains=["Government", "Spoken"], + task_subtypes=["Political classification"], + license="cc-by-4.0", + annotations_creators="derived", # based on the speaker affiliation + dialect=[], # unknown + sample_creation="found", + bibtex_citation=r""" +@inproceedings{kummervold-etal-2021-operationalizing, + abstract = {In this work, we show the process of building a large-scale training set from digital and digitized collections at a national library. The resulting Bidirectional Encoder Representations from Transformers (BERT)-based language model for Norwegian outperforms multilingual BERT (mBERT) models in several token and sequence classification tasks for both Norwegian Bokm{\aa}l and Norwegian Nynorsk. Our model also improves the mBERT performance for other languages present in the corpus such as English, Swedish, and Danish. For languages not included in the corpus, the weights degrade moderately while keeping strong multilingual properties. Therefore, we show that building high-quality models within a memory institution using somewhat noisy optical character recognition (OCR) content is feasible, and we hope to pave the way for other memory institutions to follow.}, + address = {Reykjavik, Iceland (Online)}, + author = {Kummervold, Per E and +De la Rosa, Javier and +Wetjen, Freddy and +Brygfjeld, Svein Arne}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {20--29}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {Operationalizing a National Digital Library: The Case for a {N}orwegian Transformer Model}, + url = {https://aclanthology.org/2021.nodalida-main.3}, + year = {2021}, +} +""", + prompt="Classify parliament speeches in Norwegian based on political affiliation", + adapted_from=["NorwegianParliamentClassification"], + ) diff --git a/mteb/tasks/Classification/ory/OdiaNewsClassification.py b/mteb/tasks/Classification/ory/OdiaNewsClassification.py index 214b8f67f3..dab2f279a8 100644 --- a/mteb/tasks/Classification/ory/OdiaNewsClassification.py +++ b/mteb/tasks/Classification/ory/OdiaNewsClassification.py @@ -5,6 +5,7 @@ class OdiaNewsClassification(AbsTaskClassification): + superseded_by = "OdiaNewsClassification.v2" metadata = TaskMetadata( name="OdiaNewsClassification", description="A Odia dataset for 3-class classification of Odia news articles", @@ -39,3 +40,41 @@ class OdiaNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"headings": "text"}) self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) + + +class OdiaNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="OdiaNewsClassification.v2", + description="""A Odia dataset for 3-class classification of Odia news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/goru001/nlp-for-odia", + dataset={ + "path": "mteb/odia_news", + "revision": "a594be3144fdf15c1a7efcb0aa7484cbb9a8dba3", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["ory-Orya"], + main_score="f1", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", + adapted_from=["OdiaNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) diff --git a/mteb/tasks/Classification/pol/PolishClassification.py b/mteb/tasks/Classification/pol/PolishClassification.py index d75a37a362..ad63a10da5 100644 --- a/mteb/tasks/Classification/pol/PolishClassification.py +++ b/mteb/tasks/Classification/pol/PolishClassification.py @@ -5,6 +5,7 @@ class CbdClassification(AbsTaskClassification): + superseded_by = "CBD.v2" metadata = TaskMetadata( name="CBD", description="Polish Tweets annotated for cyberbullying detection.", @@ -40,7 +41,46 @@ class CbdClassification(AbsTaskClassification): ) +class CbdClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="CBD.v2", + description="""Polish Tweets annotated for cyberbullying detection. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="http://2019.poleval.pl/files/poleval2019.pdf", + dataset={ + "path": "mteb/cbd", + "revision": "d962699e284a173179a05052b49d0a9001a25bc0", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["pol-Latn"], + main_score="accuracy", + date=("2019-01-01", "2019-12-31"), # best guess: based on publication date + domains=["Written", "Social"], + task_subtypes=["Sentiment/Hate speech"], + license="bsd-3-clause", + annotations_creators="human-annotated", # guess + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@proceedings{ogr:kob:19:poleval, + address = {Warsaw, Poland}, + editor = {Maciej Ogrodniczuk and Łukasz Kobyliński}, + isbn = {978-83-63159-28-3}, + publisher = {Institute of Computer Science, Polish Academy of Sciences}, + title = {{Proceedings of the PolEval 2019 Workshop}}, + url = {http://2019.poleval.pl/files/poleval2019.pdf}, + year = {2019}, +} +""", + adapted_from=["CbdClassification"], + ) + + class PolEmo2InClassification(AbsTaskClassification): + superseded_by = "PolEmo2.0-IN.v2" metadata = TaskMetadata( name="PolEmo2.0-IN", description="A collection of Polish online reviews from four domains: medicine, hotels, products and " @@ -83,7 +123,52 @@ class PolEmo2InClassification(AbsTaskClassification): ) +class PolEmo2InClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PolEmo2.0-IN.v2", + description="A collection of Polish online reviews from four domains: medicine, hotels, products and " + + "school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) reviews.", + reference="https://aclanthology.org/K19-1092.pdf", + dataset={ + "path": "mteb/pol_emo2_in", + "revision": "15f86f0432cd7c91437cf7c673993527e2f53fd8", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["pol-Latn"], + main_score="accuracy", + date=("2004-01-01", "2019-05-30"), # based on plot in paper + domains=["Written", "Social"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{kocon-etal-2019-multi, + abstract = {In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).}, + address = {Hong Kong, China}, + author = {Koco{\'n}, Jan and +Mi{\l}kowski, Piotr and +Za{\'s}ko-Zieli{\'n}ska, Monika}, + booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)}, + doi = {10.18653/v1/K19-1092}, + month = nov, + pages = {980--991}, + publisher = {Association for Computational Linguistics}, + title = {Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews}, + url = {https://aclanthology.org/K19-1092}, + year = {2019}, +} +""", + adapted_from=["PolEmo2InClassification"], + ) + + class PolEmo2OutClassification(AbsTaskClassification): + superseded_by = "PolEmo2.0-OUT.v2" metadata = TaskMetadata( name="PolEmo2.0-OUT", description="A collection of Polish online reviews from four domains: medicine, hotels, products and " @@ -111,7 +196,37 @@ class PolEmo2OutClassification(AbsTaskClassification): ) +class PolEmo2OutClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PolEmo2.0-OUT.v2", + description="A collection of Polish online reviews from four domains: medicine, hotels, products and " + + "school. The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and " + + "school) reviews using models train on reviews from medicine and hotels domains.", + reference="https://aclanthology.org/K19-1092.pdf", + dataset={ + "path": "mteb/pol_emo2_out", + "revision": "f7f3752b56dcbc4c84077274dfa687efa38476fb", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["pol-Latn"], + main_score="accuracy", + date=("2004-01-01", "2019-05-30"), # based on plot in paper + domains=["Written", "Social"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + adapted_from=["PolEmo2OutClassification"], + ) + + class AllegroReviewsClassification(AbsTaskClassification): + superseded_by = "AllegroReviews.v2" metadata = TaskMetadata( name="AllegroReviews", description="A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.", @@ -161,7 +276,60 @@ class AllegroReviewsClassification(AbsTaskClassification): ) +class AllegroReviewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="AllegroReviews.v2", + description="""A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2020.acl-main.111.pdf", + dataset={ + "path": "mteb/allegro_reviews", + "revision": "5233456d195235bf93f45b8ef54d72f72957dbf1", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["pol-Latn"], + main_score="accuracy", + date=( + "2020-06-22", + "2020-07-07", + ), # best guess: based on commit dates in https://github.com/allegro/klejbenchmark-baselines + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{rybak-etal-2020-klej, + abstract = {In recent years, a series of Transformer-based models unlocked major improvements in general natural language understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language, which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based models.}, + address = {Online}, + author = {Rybak, Piotr and +Mroczkowski, Robert and +Tracz, Janusz and +Gawlik, Ireneusz}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/2020.acl-main.111}, + editor = {Jurafsky, Dan and +Chai, Joyce and +Schluter, Natalie and +Tetreault, Joel}, + month = jul, + pages = {1191--1201}, + publisher = {Association for Computational Linguistics}, + title = {{KLEJ}: Comprehensive Benchmark for {P}olish Language Understanding}, + url = {https://aclanthology.org/2020.acl-main.111/}, + year = {2020}, +} +""", + adapted_from=["AllegroReviewsClassification"], + ) + + class PacClassification(AbsTaskClassification): + superseded_by = "PAC.v2" metadata = TaskMetadata( name="PAC", description="Polish Paraphrase Corpus", @@ -196,3 +364,41 @@ class PacClassification(AbsTaskClassification): } """, ) + + +class PacClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="PAC.v2", + description="""Polish Paraphrase Corpus + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/pdf/2211.13112.pdf", + dataset={ + "path": "mteb/pac", + "revision": "53c98e6a9173c550f1b60f0da9152e67e9618897", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["pol-Latn"], + main_score="accuracy", + date=("2021-01-01", "2021-12-31"), # best guess: based on publication date + domains=["Legal", "Written"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators=None, + dialect=[], + sample_creation=None, + bibtex_citation=r""" +@misc{augustyniak2022waydesigningcompilinglepiszcze, + archiveprefix = {arXiv}, + author = {Łukasz Augustyniak and Kamil Tagowski and Albert Sawczyn and Denis Janiak and Roman Bartusiak and Adrian Szymczak and Marcin Wątroba and Arkadiusz Janz and Piotr Szymański and Mikołaj Morzy and Tomasz Kajdanowicz and Maciej Piasecki}, + eprint = {2211.13112}, + primaryclass = {cs.CL}, + title = {This is the way: designing and compiling LEPISZCZE, a comprehensive NLP benchmark for Polish}, + url = {https://arxiv.org/abs/2211.13112}, + year = {2022}, +} +""", + adapted_from=["PacClassification"], + ) diff --git a/mteb/tasks/Classification/ron/Moroco.py b/mteb/tasks/Classification/ron/Moroco.py index 479dd5a1d4..c8114a3c82 100644 --- a/mteb/tasks/Classification/ron/Moroco.py +++ b/mteb/tasks/Classification/ron/Moroco.py @@ -7,6 +7,7 @@ class Moroco(AbsTaskClassification): + superseded_by = "Moroco.v2" metadata = TaskMetadata( name="Moroco", dataset={ @@ -50,3 +51,47 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class MorocoV2(AbsTaskClassification): + metadata = TaskMetadata( + name="Moroco.v2", + dataset={ + "path": "mteb/moroco", + "revision": "6e70588dbd3d583da8b85989c1c3ab3d4bd2e7c4", + }, + description="""The Moldavian and Romanian Dialectal Corpus. The MOROCO data set contains Moldavian and Romanian samples of text collected from the news domain. The samples belong to one of the following six topics: (0) culture, (1) finance, (2) politics, (3) science, (4) sports, (5) tech + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/moroco", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ron-Latn"], + main_score="accuracy", + date=("2017-10-01", "2017-10-31"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[ + "ron-Latn-ron", + "ron-Latn-mol", + ], # Moldavian, or the Romanian dialect used in Moldova, does not have an ISO 639-1 code assigned to it. However, it has been given the three-letter code "mol" under ISO 639-3 + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Butnaru-ACL-2019, + author = {Andrei M. Butnaru and Radu Tudor Ionescu}, + booktitle = {Proceedings of ACL}, + pages = {688--698}, + title = {{MOROCO: The Moldavian and Romanian Dialectal Corpus}}, + year = {2019}, +} +""", + adapted_from=["Moroco"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py index be06de0fa5..fd39716cc7 100644 --- a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py +++ b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py @@ -5,6 +5,7 @@ class RomanianReviewsSentiment(AbsTaskClassification): + superseded_by = "RomanianReviewsSentiment.v2" metadata = TaskMetadata( name="RomanianReviewsSentiment", description="LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian", @@ -44,3 +45,43 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class RomanianReviewsSentimentV2(AbsTaskClassification): + metadata = TaskMetadata( + name="RomanianReviewsSentiment.v2", + description="""LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2101.04197", + dataset={ + "path": "mteb/romanian_reviews_sentiment", + "revision": "6b320d55fcf5fc184a9e7cc828debb34f7949432", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2020-01-01", "2021-01-11"), + eval_splits=["test"], + eval_langs=["ron-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{tache2101clustering, + author = {Anca Maria Tache and Mihaela Gaman and Radu Tudor Ionescu}, + journal = {ArXiv}, + title = {Clustering Word Embeddings with Self-Organizing Maps. Application on LaRoSeDa -- A Large Romanian Sentiment Data Set}, + year = {2021}, +} +""", + adapted_from=["RomanianReviewsSentiment"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py index 3622620d50..9375a62dc3 100644 --- a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py +++ b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py @@ -7,6 +7,7 @@ class RomanianSentimentClassification(AbsTaskClassification): + superseded_by = "RomanianSentimentClassification.v2" metadata = TaskMetadata( name="RomanianSentimentClassification", description="An Romanian dataset for sentiment classification.", @@ -44,3 +45,43 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class RomanianSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="RomanianSentimentClassification.v2", + description="""An Romanian dataset for sentiment classification. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2009.08712", + dataset={ + "path": "mteb/romanian_sentiment", + "revision": "bf545b83db13cf73ed402749b21a7777e0afdc6a", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2020-09-18", "2020-09-18"), + eval_splits=["test"], + eval_langs=["ron-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{dumitrescu2020birth, + author = {Dumitrescu, Stefan Daniel and Avram, Andrei-Marius and Pyysalo, Sampo}, + journal = {arXiv preprint arXiv:2009.08712}, + title = {The birth of Romanian BERT}, + year = {2020}, +} +""", + adapted_from=["RomanianSentimentClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/GeoreviewClassification.py b/mteb/tasks/Classification/rus/GeoreviewClassification.py index 3a9298ead2..f5f9efb1e3 100644 --- a/mteb/tasks/Classification/rus/GeoreviewClassification.py +++ b/mteb/tasks/Classification/rus/GeoreviewClassification.py @@ -5,6 +5,7 @@ class GeoreviewClassification(AbsTaskClassification): + superseded_by = "GeoreviewClassification.v2" metadata = TaskMetadata( name="GeoreviewClassification", dataset={ @@ -35,3 +36,37 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) + + +class GeoreviewClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="GeoreviewClassification.v2", + dataset={ + "path": "mteb/georeview", + "revision": "5194395f82217bc31212fd6a275002fb405f9dfb", + }, + description="""Review classification (5-point scale) based on Yandex Georeview dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/yandex/geo-reviews-dataset-2023", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2023-01-01", "2023-08-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="", + prompt="Classify the organization rating based on the reviews", + adapted_from=["GeoreviewClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py index 9def591d0c..2d4725f221 100644 --- a/mteb/tasks/Classification/rus/HeadlineClassification.py +++ b/mteb/tasks/Classification/rus/HeadlineClassification.py @@ -5,6 +5,7 @@ class HeadlineClassification(AbsTaskClassification): + superseded_by = "HeadlineClassification.v2" metadata = TaskMetadata( name="HeadlineClassification", dataset={ @@ -59,3 +60,62 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) + + +class HeadlineClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="HeadlineClassification.v2", + dataset={ + "path": "mteb/headline", + "revision": "6bd88e7778ee2e3bd8d0ade1be3ad5b6d969145a", + }, + description="""Headline rubric classification based on the paraphraser plus dataset. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2020.ngt-1.6/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2009-01-01", "2020-01-01"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{gudkov-etal-2020-automatically, + abstract = {The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.}, + address = {Online}, + author = {Gudkov, Vadim and +Mitrofanova, Olga and +Filippskikh, Elizaveta}, + booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation}, + doi = {10.18653/v1/2020.ngt-1.6}, + editor = {Birch, Alexandra and +Finch, Andrew and +Hayashi, Hiroaki and +Heafield, Kenneth and +Junczys-Dowmunt, Marcin and +Konstas, Ioannis and +Li, Xian and +Neubig, Graham and +Oda, Yusuke}, + month = jul, + pages = {54--59}, + publisher = {Association for Computational Linguistics}, + title = {Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation}, + url = {https://aclanthology.org/2020.ngt-1.6}, + year = {2020}, +} +""", + prompt="Classify the topic or theme of the given news headline", + adapted_from=["HeadlineClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py index 7ff0ed11b2..0fea5995ce 100644 --- a/mteb/tasks/Classification/rus/InappropriatenessClassification.py +++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py @@ -5,6 +5,7 @@ class InappropriatenessClassification(AbsTaskClassification): + superseded_by = "InappropriatenessClassification.v2" metadata = TaskMetadata( name="InappropriatenessClassification", dataset={ @@ -65,6 +66,69 @@ def dataset_transform(self): ) +class InappropriatenessClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="InappropriatenessClassification.v2", + dataset={ + "path": "mteb/inappropriateness", + "revision": "2bdbb71d9b972709173f1477d7dd33c3d67f51ac", + }, + description="""Inappropriateness identification in the form of binary classification + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2021.bsnlp-1.4", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2006-01-01", "2021-04-01"), + domains=["Web", "Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{babakov-etal-2021-detecting, + abstract = {Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.}, + address = {Kiyv, Ukraine}, + author = {Babakov, Nikolay and +Logacheva, Varvara and +Kozlova, Olga and +Semenov, Nikita and +Panchenko, Alexander}, + booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing}, + editor = {Babych, Bogdan and +Kanishcheva, Olga and +Nakov, Preslav and +Piskorski, Jakub and +Pivovarova, Lidia and +Starko, Vasyl and +Steinberger, Josef and +Yangarber, Roman and +Marci{\'n}czuk, Micha{\l} and +Pollak, Senja and +P{\v{r}}ib{\'a}{\v{n}}, Pavel and +Robnik-{\v{S}}ikonja, Marko}, + month = apr, + pages = {26--36}, + publisher = {Association for Computational Linguistics}, + title = {Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation}, + url = {https://aclanthology.org/2021.bsnlp-1.4}, + year = {2021}, +} +""", + prompt="Classify the given message as either sensitive topic or not", + adapted_from=["InappropriatenessClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] + ) + + class InappropriatenessClassificationv2(AbsTaskClassification): metadata = TaskMetadata( name="InappropriatenessClassificationv2", diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py index 37f9e83af3..6629869441 100644 --- a/mteb/tasks/Classification/rus/RuReviewsClassification.py +++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py @@ -5,6 +5,7 @@ class RuReviewsClassification(AbsTaskClassification): + superseded_by = "RuReviewsClassification.v2" metadata = TaskMetadata( name="RuReviewsClassification", dataset={ @@ -47,3 +48,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) + + +class RuReviewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="RuReviewsClassification.v2", + dataset={ + "path": "mteb/ru_reviews", + "revision": "46d80ee5ac51be8234725558677e59050b9c418e", + }, + description="""Product review classification (3-point scale) based on RuRevies dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/sismetanin/rureviews", + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2000-01-01", "2020-01-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Smetanin-SA-2019, + author = {Sergey Smetanin and Michail Komarov}, + booktitle = {2019 IEEE 21st Conference on Business Informatics (CBI)}, + doi = {10.1109/CBI.2019.00062}, + issn = {2378-1963}, + month = {July}, + number = {}, + pages = {482-486}, + title = {Sentiment Analysis of Product Reviews in Russian using Convolutional Neural Networks}, + volume = {01}, + year = {2019}, +} +""", + prompt="Classify product reviews into positive, negative or neutral sentiment", + adapted_from=["RuReviewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py index c2c737eea5..76bd475741 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py @@ -5,6 +5,8 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification): + superseded_by = "RuSciBenchGRNTIClassification.v2" + metadata = TaskMetadata( name="RuSciBenchGRNTIClassification", dataset={ diff --git a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py index b32f7c2b6e..33affb702e 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py @@ -5,6 +5,8 @@ class RuSciBenchOECDClassification(AbsTaskClassification): + superseded_by = "RuSciBenchOECDClassification.v2" + metadata = TaskMetadata( name="RuSciBenchOECDClassification", dataset={ diff --git a/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py index 8e511655e5..520114847f 100644 --- a/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py +++ b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py @@ -5,6 +5,7 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification): + superseded_by = "RuToxicOKMLCUPClassification.v2" metadata = TaskMetadata( name="RuToxicOKMLCUPClassification", dataset={ @@ -31,3 +32,31 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_column("toxic", "label") + + +class RuToxicOKMLCUPClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="RuToxicOKMLCUPClassification.v2", + dataset={ + "path": "mteb/ru_toxic_okmlcup", + "revision": "729025d2cfa68fcbc587ea80014a42d569cd9048", + }, + description="""On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://cups.online/ru/contests/okmlcup2020", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2015-01-01", "2020-01-01"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + adapted_from=["RuToxicOKMLCUPClassification"], + ) diff --git a/mteb/tasks/Classification/rus/senti_ru_eval.py b/mteb/tasks/Classification/rus/senti_ru_eval.py index a935dd8c76..a7c6fdbf11 100644 --- a/mteb/tasks/Classification/rus/senti_ru_eval.py +++ b/mteb/tasks/Classification/rus/senti_ru_eval.py @@ -5,6 +5,7 @@ class SentiRuEval2016Classification(AbsTaskClassification): + superseded_by = "SentiRuEval2016.v2" metadata = TaskMetadata( name="SentiRuEval2016", dataset={ @@ -38,3 +39,39 @@ class SentiRuEval2016Classification(AbsTaskClassification): } """, ) + + +class SentiRuEval2016ClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SentiRuEval2016.v2", + dataset={ + "path": "mteb/senti_ru_eval2016", + "revision": "bfa4cbec1753ffed29a8244a4ec208cc9e6c09a0", + }, + description="""Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, and participants’ results. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/mokoron/sentirueval", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2015-01-01", "2016-01-01"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{loukachevitch2016sentirueval, + author = {Loukachevitch, NV and Rubtsova, Yu V}, + booktitle = {Computational Linguistics and Intellectual Technologies}, + pages = {416--426}, + title = {SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis}, + year = {2016}, +} +""", + adapted_from=["SentiRuEval2016Classification"], + ) diff --git a/mteb/tasks/Classification/san/SanskritShlokasClassification.py b/mteb/tasks/Classification/san/SanskritShlokasClassification.py index 91b8436e8d..daff38224f 100644 --- a/mteb/tasks/Classification/san/SanskritShlokasClassification.py +++ b/mteb/tasks/Classification/san/SanskritShlokasClassification.py @@ -17,7 +17,7 @@ class SanskritShlokasClassification(AbsTaskClassification): category="s2s", modalities=["text"], date=("2019-01-01", "2020-01-01"), - eval_splits=["train", "validation"], + eval_splits=["validation"], eval_langs=["san-Deva"], main_score="accuracy", domains=["Religious", "Written"], diff --git a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py index 4b8c54a184..3d050607f5 100644 --- a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py +++ b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py @@ -5,6 +5,7 @@ class SinhalaNewsClassification(AbsTaskClassification): + superseded_by = "SinhalaNewsClassification.v2" metadata = TaskMetadata( name="SinhalaNewsClassification", description="This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015).", @@ -50,3 +51,50 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class SinhalaNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SinhalaNewsClassification.v2", + description="""This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/sinhala_news", + "revision": "e0b6e93ed5f086fe358595dff1aaad9eb877667a", + }, + reference="https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["sin-Sinh"], + main_score="accuracy", + date=("2019-03-17", "2020-08-06"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{deSilva2015, + author = {Nisansa de Silva}, + journal = {Year of Publication}, + title = {Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language}, + year = {2015}, +} + +@article{dhananjaya2022, + author = {Dhananjaya et al.}, + journal = {Year of Publication}, + title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, + year = {2022}, +} +""", + adapted_from=["SinhalaNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py index a665d1c0c9..0cb1bd9cd8 100644 --- a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py +++ b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py @@ -5,6 +5,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification): + superseded_by = "SinhalaNewsSourceClassification.v2" metadata = TaskMetadata( name="SinhalaNewsSourceClassification", description="This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala).", @@ -41,3 +42,43 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class SinhalaNewsSourceClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SinhalaNewsSourceClassification.v2", + description="""This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala). + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/sinhala_news_source", + "revision": "6902767dbfa6189cbe5f5b5b56ee6300b1702d33", + }, + reference="https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["sin-Sinh"], + main_score="accuracy", + date=("2021-02-17", "2022-08-20"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{dhananjaya2022, + author = {Dhananjaya et al.}, + journal = {Year of Publication}, + title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, + year = {2022}, +} +""", + adapted_from=["SinhalaNewsSourceClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py index 6577f7f315..ee6c22e79c 100644 --- a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py @@ -3,8 +3,11 @@ from mteb.abstasks.AbsTaskClassification import AbsTaskClassification from mteb.abstasks.TaskMetadata import TaskMetadata +N_SAMPLES = 2048 + class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification): + superseded_by = "CSFDSKMovieReviewSentimentClassification.v2" metadata = TaskMetadata( name="CSFDSKMovieReviewSentimentClassification", description="The dataset contains 30k user reviews from csfd.cz in Slovak.", @@ -42,11 +45,54 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification): samples_per_label = 20 def dataset_transform(self): - N_SAMPLES = 2048 - self.dataset = self.dataset.rename_columns( {"comment": "text", "rating_int": "label"} ) self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES ) + + +class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="CSFDSKMovieReviewSentimentClassification.v2", + description="""The dataset contains 30k user reviews from csfd.cz in Slovak. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/abs/2304.01922", + dataset={ + "path": "mteb/csfdsk_movie_review_sentiment", + "revision": "257ee340c1399ab5e038a3aea38877f67940774d", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2002-05-21", "2020-03-05"), + eval_splits=["test"], + eval_langs=["slk-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{štefánik2023resources, + archiveprefix = {arXiv}, + author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, + eprint = {2304.01922}, + primaryclass = {cs.CL}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, +} +""", + adapted_from=["CSFDSKMovieReviewSentimentClassification"], + ) + + # Increase the samples_per_label in order to improve baseline performance + samples_per_label = 20 + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES + ) diff --git a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py index bd131ece85..6a4823b6e9 100644 --- a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py +++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py @@ -5,6 +5,7 @@ class SlovakHateSpeechClassification(AbsTaskClassification): + superseded_by = "SlovakHateSpeechClassification.v2" metadata = TaskMetadata( name="SlovakHateSpeechClassification", description="The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak.", @@ -28,3 +29,31 @@ class SlovakHateSpeechClassification(AbsTaskClassification): sample_creation="found", bibtex_citation="", ) + + +class SlovakHateSpeechClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SlovakHateSpeechClassification.v2", + description="""The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak", + dataset={ + "path": "mteb/slovak_hate_speech", + "revision": "691fe861df0ffa25066cbf6da8e64ebd296af6ab", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2024-05-25", "2024-06-06"), + eval_splits=["test"], + eval_langs=["slk-Latn"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="", + adapted_from=["SlovakHateSpeechClassification"], + ) diff --git a/mteb/tasks/Classification/slv/FrenkSlClassification.py b/mteb/tasks/Classification/slv/FrenkSlClassification.py index f88d4ff9ff..6a5c765a8d 100644 --- a/mteb/tasks/Classification/slv/FrenkSlClassification.py +++ b/mteb/tasks/Classification/slv/FrenkSlClassification.py @@ -5,6 +5,7 @@ class FrenkSlClassification(AbsTaskClassification): + superseded_by = "FrenkSlClassification.v2" metadata = TaskMetadata( name="FrenkSlClassification", description="Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset.", @@ -44,3 +45,46 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class FrenkSlClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="FrenkSlClassification.v2", + description="""Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/frenk_sl", + "revision": "3b69facc14651fbd152fda173683a7ecf9125b82", + }, + reference="https://arxiv.org/pdf/1906.02045", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["slv-Latn"], + main_score="accuracy", + date=("2021-05-28", "2021-05-28"), + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", + adapted_from=["FrenkSlClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/spa/SpanishNewsClassification.py b/mteb/tasks/Classification/spa/SpanishNewsClassification.py index 59ac97ba20..5543bab43e 100644 --- a/mteb/tasks/Classification/spa/SpanishNewsClassification.py +++ b/mteb/tasks/Classification/spa/SpanishNewsClassification.py @@ -5,6 +5,7 @@ class SpanishNewsClassification(AbsTaskClassification): + superseded_by = "SpanishNewsClassification.v2" metadata = TaskMetadata( name="SpanishNewsClassification", description="A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories.", @@ -35,3 +36,37 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class SpanishNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SpanishNewsClassification.v2", + description="""A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news", + dataset={ + "path": "mteb/spanish_news", + "revision": "345aa68ec44052d28828c6f88e7a2aafaf74be5a", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2023-05-01", "2024-05-01"), + eval_splits=["test"], + eval_langs=["spa-Latn"], + main_score="accuracy", + domains=["News", "Written"], + task_subtypes=[], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + """, + adapted_from=["SpanishNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py index 785b131bbc..49f88eff08 100644 --- a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py +++ b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py @@ -5,6 +5,7 @@ class SpanishSentimentClassification(AbsTaskClassification): + superseded_by = "SpanishSentimentClassification.v2" metadata = TaskMetadata( name="SpanishSentimentClassification", description="A Spanish dataset for sentiment classification.", @@ -52,3 +53,55 @@ class SpanishSentimentClassification(AbsTaskClassification): } """, ) + + +class SpanishSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SpanishSentimentClassification.v2", + description="""A Spanish dataset for sentiment classification. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment", + dataset={ + "path": "mteb/spanish_sentiment", + "revision": "307dea211013736d7d146dad9d2f6330e44d29b8", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2022-08-16", "2022-08-16"), + eval_splits=["validation", "test"], + eval_langs=["spa-Latn"], + main_score="accuracy", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{mollanorozy-etal-2023-cross, + address = {Dubrovnik, Croatia}, + author = {Mollanorozy, Sepideh and +Tanti, Marc and +Nissim, Malvina}, + booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP}, + doi = {10.18653/v1/2023.sigtyp-1.9}, + editor = {Beinborn, Lisa and +Goswami, Koustava and +Murado{\\u{g}}lu, Saliha and +Sorokin, Alexey and +Kumar, Ritesh and +Shcherbakov, Andreas and +Ponti, Edoardo M. and +Cotterell, Ryan and +Vylomova, Ekaterina}, + month = may, + pages = {89--95}, + publisher = {Association for Computational Linguistics}, + title = {Cross-lingual Transfer Learning with \{P\}ersian}, + url = {https://aclanthology.org/2023.sigtyp-1.9}, + year = {2023}, +} +""", + adapted_from=["SpanishSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py index e5b667e289..297eb94438 100644 --- a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py +++ b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py @@ -5,6 +5,7 @@ class SiswatiNewsClassification(AbsTaskClassification): + superseded_by = "SiswatiNewsClassification.v2" metadata = TaskMetadata( name="SiswatiNewsClassification", description="Siswati News Classification Dataset", @@ -41,3 +42,41 @@ class SiswatiNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"title": "text"}) + + +class SiswatiNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SiswatiNewsClassification.v2", + description="""Siswati News Classification Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news", + dataset={ + "path": "mteb/siswati_news", + "revision": "e316774d8bbaa9b43858d093ea0f1eb38c6a9b4c", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ssw-Latn"], + main_score="accuracy", + date=("2022-08-01", "2022-08-01"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{Madodonga_Marivate_Adendorff_2023, + author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, + doi = {10.55492/dhasa.v4i01.4449}, + month = {Jan.}, + title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, + url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, + volume = {4}, + year = {2023}, +} +""", + adapted_from=["SiswatiNewsClassification"], + ) diff --git a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py index 25df08775d..109a237052 100644 --- a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py @@ -5,6 +5,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification): + superseded_by = "SlovakMovieReviewSentimentClassification.v2" metadata = TaskMetadata( name="SlovakMovieReviewSentimentClassification", description="User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative)", @@ -42,3 +43,43 @@ def dataset_transform(self) -> None: self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SlovakMovieReviewSentimentClassification.v2", + description="""User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://arxiv.org/pdf/2304.01922", + dataset={ + "path": "mteb/slovak_movie_review_sentiment", + "revision": "29a7405aabcfd4860a51ae6f65a5650d63108f26", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["svk-Latn"], + main_score="accuracy", + date=("2002-05-21", "2020-03-05"), + dialect=[], + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="found", + bibtex_citation=r""" +@article{vstefanik2023resources, + author = {{\v{S}}tef{\'a}nik, Michal and Kadl{\v{c}}{\'\i}k, Marek and Gramacki, Piotr and Sojka, Petr}, + journal = {arXiv preprint arXiv:2304.01922}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, +} +""", + adapted_from=["SlovakMovieReviewSentimentClassification"], + ) + + def dataset_transform(self) -> None: + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py index 518b749de0..abf644702d 100644 --- a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py +++ b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py @@ -5,6 +5,7 @@ class SwahiliNewsClassification(AbsTaskClassification): + superseded_by = "SwahiliNewsClassification.v2" metadata = TaskMetadata( name="SwahiliNewsClassification", description="Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets", @@ -45,3 +46,45 @@ def dataset_transform(self) -> None: self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train"] ) + + +class SwahiliNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SwahiliNewsClassification.v2", + description="""Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/Mollel/SwahiliNewsClassification", + dataset={ + "path": "mteb/swahili_news", + "revision": "d929055f41849d5bc3533c07d978fcfbc89d6a4e", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["swa-Latn"], + main_score="accuracy", + date=("2019-01-01", "2023-05-01"), + dialect=[], + domains=["News", "Written"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{davis2020swahili, + author = {Davis, David}, + doi = {10.5281/zenodo.5514203}, + publisher = {Zenodo}, + title = {Swahili: News Classification Dataset (0.2)}, + url = {https://doi.org/10.5281/zenodo.5514203}, + year = {2020}, +} +""", + adapted_from=["SwahiliNewsClassification"], + ) + + def dataset_transform(self) -> None: + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/mteb/tasks/Classification/swe/DalajClassification.py b/mteb/tasks/Classification/swe/DalajClassification.py index 05983d0e4f..36e594e569 100644 --- a/mteb/tasks/Classification/swe/DalajClassification.py +++ b/mteb/tasks/Classification/swe/DalajClassification.py @@ -6,6 +6,7 @@ class DalajClassification(AbsTaskClassification): + superseded_by = "DalajClassification.v2" metadata = TaskMetadata( name="DalajClassification", dataset={ @@ -67,3 +68,42 @@ def __convert_sample_to_classification(sample): batched=True, remove_columns=columns_to_keep, ) + + +class DalajClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="DalajClassification.v2", + dataset={ + "path": "mteb/dalaj", + "revision": "ecf6f2d83e8e85816ec3974896557a4aafce4f3e", + "name": "dalaj", + }, + description="""A Swedish dataset for linguistic acceptability. Available as a part of Superlim. + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://spraakbanken.gu.se/en/resources/superlim", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["swe-Latn"], + main_score="accuracy", + date=("2017-01-01", "2020-12-31"), + domains=["Non-fiction", "Written"], + task_subtypes=["Linguistic acceptability"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@misc{2105.06681, + author = {Elena Volodina and Yousuf Ali Mohammed and Julia Klezl}, + eprint = {arXiv:2105.06681}, + title = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish: Format, baseline, sharing}, + year = {2021}, +} +""", + prompt="Classify texts based on linguistic acceptability in Swedish", + adapted_from=["DalajClassification"], + ) + + samples_per_label = 16 diff --git a/mteb/tasks/Classification/swe/SweRecClassification.py b/mteb/tasks/Classification/swe/SweRecClassification.py index 8cc7b8dff8..5de86af7bf 100644 --- a/mteb/tasks/Classification/swe/SweRecClassification.py +++ b/mteb/tasks/Classification/swe/SweRecClassification.py @@ -5,6 +5,7 @@ class SweRecClassification(AbsTaskClassification): + superseded_by = "SweRecClassification.v2" metadata = TaskMetadata( name="SweRecClassification", description="A Swedish dataset for sentiment classification on review", @@ -43,3 +44,46 @@ class SweRecClassification(AbsTaskClassification): """, prompt="Classify Swedish reviews by sentiment", ) + + +class SweRecClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SweRecClassification.v2", + description="""A Swedish dataset for sentiment classification on review + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2023.nodalida-1.20/", + dataset={ + "path": "mteb/swe_rec", + "revision": "2a18a4ccc6770319b7f717cda1800f7d5bd5cd1a", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["swe-Latn"], + main_score="accuracy", + date=("2023-01-01", "2023-12-31"), # based on the publication date + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{nielsen-2023-scandeval, + address = {T{\'o}rshavn, Faroe Islands}, + author = {Nielsen, Dan}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Alum{\"a}e, Tanel and +Fishel, Mark}, + month = may, + pages = {185--201}, + publisher = {University of Tartu Library}, + title = {{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing}, + url = {https://aclanthology.org/2023.nodalida-1.20}, + year = {2023}, +} +""", + prompt="Classify Swedish reviews by sentiment", + adapted_from=["SweRecClassification"], + ) diff --git a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py index 149be829fc..4541b5622f 100644 --- a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py +++ b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py @@ -5,6 +5,7 @@ class SwedishSentimentClassification(AbsTaskClassification): + superseded_by = "SwedishSentimentClassification.v2" metadata = TaskMetadata( name="SwedishSentimentClassification", description="Dataset of Swedish reviews scarped from various public available websites", @@ -28,3 +29,31 @@ class SwedishSentimentClassification(AbsTaskClassification): sample_creation="found", bibtex_citation="", ) + + +class SwedishSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="SwedishSentimentClassification.v2", + description="""Dataset of Swedish reviews scarped from various public available websites + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/swedish_reviews", + dataset={ + "path": "mteb/swedish_sentiment", + "revision": "f521560ac618eea57c85392c574c16b6c08c9487", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=["swe-Latn"], + main_score="accuracy", + date=("2021-01-01", "2022-01-01"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="", + adapted_from=["SwedishSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/tam/TamilNewsClassification.py b/mteb/tasks/Classification/tam/TamilNewsClassification.py index 3f4505bce8..af5a2f2c71 100644 --- a/mteb/tasks/Classification/tam/TamilNewsClassification.py +++ b/mteb/tasks/Classification/tam/TamilNewsClassification.py @@ -5,6 +5,7 @@ class TamilNewsClassification(AbsTaskClassification): + superseded_by = "TamilNewsClassification.v2" metadata = TaskMetadata( name="TamilNewsClassification", description="A Tamil dataset for 6-class classification of Tamil news articles", @@ -41,3 +42,41 @@ def dataset_transform(self): {"NewsInTamil": "text", "Category": "label"} ) self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) + + +class TamilNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TamilNewsClassification.v2", + description="""A Tamil dataset for 6-class classification of Tamil news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/vanangamudi/tamil-news-classification", + dataset={ + "path": "mteb/tamil_news", + "revision": "b417dba1f5a3143f8325b6b6fb585ab4a57c03a0", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["tam-Taml"], + main_score="f1", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", + adapted_from=["TamilNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) diff --git a/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py b/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py index 3d07293c64..3daf1f54cf 100644 --- a/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py +++ b/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py @@ -5,6 +5,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification): + superseded_by = "TeluguAndhraJyotiNewsClassification.v2" metadata = TaskMetadata( name="TeluguAndhraJyotiNewsClassification", description="A Telugu dataset for 5-class classification of Telugu news articles", @@ -32,3 +33,34 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"}) self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) + + +class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TeluguAndhraJyotiNewsClassification.v2", + description="""A Telugu dataset for 5-class classification of Telugu news articles + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset", + dataset={ + "path": "mteb/telugu_andhra_jyoti_news", + "revision": "032752fb5f3a8c5b0814061c6502e0e8d58ec77c", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2014-01-01", "2018-01-01"), + eval_splits=["test"], + eval_langs=["tel-Telu"], + main_score="f1", + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="", + adapted_from=["TeluguAndhraJyotiNewsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed) diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py index 242249340a..b829c3b2c9 100644 --- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py +++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py @@ -5,6 +5,7 @@ class WisesightSentimentClassification(AbsTaskClassification): + superseded_by = "WisesightSentimentClassification.v2" metadata = TaskMetadata( name="WisesightSentimentClassification", description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)", @@ -42,3 +43,45 @@ class WisesightSentimentClassification(AbsTaskClassification): } """, ) + + +class WisesightSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="WisesightSentimentClassification.v2", + description="""Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/PyThaiNLP/wisesight-sentiment", + dataset={ + "path": "mteb/wisesight_sentiment", + "revision": "aa2a5976a75df7f667215ac14353b3f5d07ba598", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["tha-Thai"], + main_score="f1", + date=("2019-05-24", "2021-09-16"), + dialect=[], + domains=["Social", "News", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc0-1.0", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@software{bact_2019_3457447, + author = {Suriyawongkul, Arthit and +Chuangsuwanich, Ekapol and +Chormai, Pattarawat and +Polpanumas, Charin}, + doi = {10.5281/zenodo.3457447}, + month = sep, + publisher = {Zenodo}, + title = {PyThaiNLP/wisesight-sentiment: First release}, + url = {https://doi.org/10.5281/zenodo.3457447}, + version = {v1.0}, + year = {2019}, +} +""", + adapted_from=["WisesightSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py index 9a51214759..94f23830c6 100644 --- a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py +++ b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py @@ -6,7 +6,7 @@ class WongnaiReviewsClassification(AbsTaskClassification): metadata = TaskMetadata( - name="WongnaiReviewsClassification ", + name="WongnaiReviewsClassification", description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corressponding each star rating", reference="https://github.com/wongnai/wongnai-corpus", dataset={ diff --git a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py index e9095fd0d3..4d0fd118e2 100644 --- a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py +++ b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py @@ -5,6 +5,7 @@ class TswanaNewsClassification(AbsTaskClassification): + superseded_by = "TswanaNewsClassification.v2" metadata = TaskMetadata( name="TswanaNewsClassification", description="Tswana News Classification Dataset", @@ -39,3 +40,42 @@ class TswanaNewsClassification(AbsTaskClassification): } """, ) + + +class TswanaNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TswanaNewsClassification.v2", + description="""Tswana News Classification Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17", + dataset={ + "path": "mteb/tswana_news", + "revision": "2bbd0687d1733ac419fba18378bd9d864aae081c", + }, + type="Classification", + task_subtypes=["Topic classification"], + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["tsn-Latn"], + main_score="accuracy", + date=("2015-01-01", "2023-01-01"), + domains=["News", "Written"], + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{marivate2023puoberta, + author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai}, + booktitle = {SACAIR 2023 (To Appear)}, + dataset_url = {https://github.com/dsfsi/PuoBERTa}, + keywords = {NLP}, + preprint_url = {https://arxiv.org/abs/2310.09141}, + software_url = {https://huggingface.co/dsfsi/PuoBERTa}, + title = {PuoBERTa: Training and evaluation of a curated language model for Setswana}, + year = {2023}, +} +""", + adapted_from=["TswanaNewsClassification"], + ) diff --git a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py index 680b52009b..e0f97dd325 100644 --- a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py +++ b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py @@ -5,6 +5,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification): + superseded_by = "TurkishMovieSentimentClassification.v2" metadata = TaskMetadata( name="TurkishMovieSentimentClassification", description="Turkish Movie Review Dataset", @@ -41,3 +42,44 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class TurkishMovieSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TurkishMovieSentimentClassification.v2", + description="""Turkish Movie Review Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf", + dataset={ + "path": "mteb/turkish_movie_sentiment", + "revision": "8ef5ce93ff2504de7fc46776317b78bdd8db47f2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["tur-Latn"], + main_score="accuracy", + date=("2013-01-01", "2013-08-11"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Demirtas2013CrosslingualPD, + author = {Erkin Demirtas and Mykola Pechenizkiy}, + booktitle = {wisdom}, + title = {Cross-lingual polarity detection with machine translation}, + url = {https://api.semanticscholar.org/CorpusID:3912960}, + year = {2013}, +} +""", + adapted_from=["TurkishMovieSentimentClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py index 7bfb086d99..98f089565a 100644 --- a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py +++ b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py @@ -5,6 +5,7 @@ class TurkishProductSentimentClassification(AbsTaskClassification): + superseded_by = "TurkishProductSentimentClassification.v2" metadata = TaskMetadata( name="TurkishProductSentimentClassification", description="Turkish Product Review Dataset", @@ -36,3 +37,39 @@ class TurkishProductSentimentClassification(AbsTaskClassification): } """, ) + + +class TurkishProductSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TurkishProductSentimentClassification.v2", + description="""Turkish Product Review Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf", + dataset={ + "path": "mteb/turkish_product_sentiment", + "revision": "c846c08821e2ca649929a5562953c0466cd44736", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["tur-Latn"], + main_score="accuracy", + date=("2013-01-01", "2013-08-11"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Demirtas2013CrosslingualPD, + author = {Erkin Demirtas and Mykola Pechenizkiy}, + booktitle = {wisdom}, + title = {Cross-lingual polarity detection with machine translation}, + url = {https://api.semanticscholar.org/CorpusID:3912960}, + year = {2013}, +} +""", + adapted_from=["TurkishProductSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py index fadc60edd8..aedd65910b 100644 --- a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py +++ b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py @@ -5,6 +5,7 @@ class UkrFormalityClassification(AbsTaskClassification): + superseded_by = "UkrFormalityClassification.v2" metadata = TaskMetadata( name="UkrFormalityClassification", description=""" @@ -52,3 +53,53 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["train", "test"] ) + + +class UkrFormalityClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="UkrFormalityClassification.v2", + description=""" + This dataset contains Ukrainian Formality Classification dataset obtained by + trainslating English GYAFC data. + English data source: https://aclanthology.org/N18-1012/ + Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M + Additionally, the dataset was balanced, witha labels: 0 - informal, 1 - formal. + + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + dataset={ + "path": "mteb/ukr_formality", + "revision": "e0b2dfa57d505f207deb571e58b0bd0b81180bd4", + }, + reference="https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ukr-Cyrl"], + main_score="accuracy", + date=("2018-04-11", "2018-06-20"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="openrail++", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated", + bibtex_citation=r""" +@inproceedings{rao-tetreault-2018-dear, + author = {Rao, Sudha and +Tetreault, Joel}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + month = jun, + publisher = {Association for Computational Linguistics}, + title = {Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer}, + url = {https://aclanthology.org/N18-1012}, + year = {2018}, +} +""", + adapted_from=["UkrFormalityClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train", "test"] + ) diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py index b6555c9f08..a3d116f807 100644 --- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py +++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py @@ -5,6 +5,7 @@ class UrduRomanSentimentClassification(AbsTaskClassification): + superseded_by = "UrduRomanSentimentClassification.v2" metadata = TaskMetadata( name="UrduRomanSentimentClassification", description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)", @@ -36,3 +37,39 @@ class UrduRomanSentimentClassification(AbsTaskClassification): } """, ) + + +class UrduRomanSentimentClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="UrduRomanSentimentClassification.v2", + description="""The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral) + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set", + dataset={ + "path": "mteb/urdu_roman_sentiment", + "revision": "fe3ea6b93097e7a2eb1356ad3665fd01667ac6be", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=("2018-01-01", "2018-08-28"), + eval_splits=["test"], + eval_langs=["urd-Latn"], + main_score="f1", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{misc_roman_urdu_data_set_458, + author = {Sharf,Zareen}, + howpublished = {UCI Machine Learning Repository}, + note = {{DOI}: https://doi.org/10.24432/C58325}, + title = {{Roman Urdu Data Set}}, + year = {2018}, +} +""", + adapted_from=["UrduRomanSentimentClassification"], + ) diff --git a/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py b/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py new file mode 100644 index 0000000000..540e5f2eda --- /dev/null +++ b/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AmazonCounterfactualVNClassification(AbsTaskClassification): + num_samples = 32 + + metadata = TaskMetadata( + name="AmazonCounterfactualVNClassification", + dataset={ + "path": "GreenNode/amazon-counterfactual-vn", + "revision": "b48bc27d383cfca5b6a47135a52390fa5f66b253", + }, + description="""A collection of translated Amazon customer reviews annotated for counterfactual detection pair classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria. + """, + reference="https://arxiv.org/abs/2104.06893", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Counterfactual Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonCounterfactualClassification"], + ) diff --git a/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py b/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py new file mode 100644 index 0000000000..e3beceda5c --- /dev/null +++ b/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AmazonPolarityVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonPolarityVNClassification", + description="""A collection of translated Amazon customer reviews annotated for polarity classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria. + """, + reference="https://huggingface.co/datasets/amazon_polarity", + dataset={ + "path": "GreenNode/amazon-polarity-vn", + "revision": "4e9a0d6e6bd97ab32f23c50c043d751eed2a5f8a", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonPolarityClassification"], + ) diff --git a/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py b/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py new file mode 100644 index 0000000000..25ad64744e --- /dev/null +++ b/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AmazonReviewsVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonReviewsVNClassification", + dataset={ + "path": "GreenNode/amazon-reviews-multi-vn", + "revision": "27da94deb6d4f44af789a3d70750fa506b79f189", + }, + description="""A collection of translated Amazon reviews specifically designed to aid research in multilingual text classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2010.02573", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Emotion classification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonReviewsClassification"], + ) diff --git a/mteb/tasks/Classification/vie/Banking77VNClassification.py b/mteb/tasks/Classification/vie/Banking77VNClassification.py new file mode 100644 index 0000000000..a051965bca --- /dev/null +++ b/mteb/tasks/Classification/vie/Banking77VNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class Banking77VNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="Banking77VNClassification", + description="""A translated dataset composed of online banking queries annotated with their corresponding intents. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2003.04807", + dataset={ + "path": "GreenNode/banking77-vn", + "revision": "42541b07c25a49604be129fba6d70b752be229c1", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Banking77Classification"], + ) diff --git a/mteb/tasks/Classification/vie/EmotionVNClassification.py b/mteb/tasks/Classification/vie/EmotionVNClassification.py new file mode 100644 index 0000000000..d2a7b44e7a --- /dev/null +++ b/mteb/tasks/Classification/vie/EmotionVNClassification.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class EmotionVNClassification(AbsTaskClassification): + num_samples = 16 + + metadata = TaskMetadata( + name="EmotionVNClassification", + description="""Emotion is a translated dataset of Vietnamese from English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.aclweb.org/anthology/D18-1404", + dataset={ + "path": "GreenNode/emotion-vn", + "revision": "797a93c0dd755ebf5818fbf54d0e0a024df9216d", + }, + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["EmotionClassification"], + ) diff --git a/mteb/tasks/Classification/vie/ImdbVNClassification.py b/mteb/tasks/Classification/vie/ImdbVNClassification.py new file mode 100644 index 0000000000..9d87ca3c98 --- /dev/null +++ b/mteb/tasks/Classification/vie/ImdbVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ImdbVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="ImdbVNClassification", + description="""A translated dataset of large movie reviews annotated for sentiment classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + dataset={ + "path": "GreenNode/imdb-vn", + "revision": "0dccb383ee26c90c99d03c8674cf40de642f099a", + }, + reference="http://www.aclweb.org/anthology/P11-1015", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ImdbClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py b/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py new file mode 100644 index 0000000000..9050165762 --- /dev/null +++ b/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MTOPDomainVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MTOPDomainVNClassification", + dataset={ + "path": "GreenNode/mtop-domain-vn", + "revision": "6e1ec8c54c018151c77472d94b1c0765230cf6ca", + }, + description="""A translated dataset from MTOP: Multilingual Task-Oriented Semantic Parsing + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/pdf/2008.09335.pdf", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken", "Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MTOPDomainClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py b/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py new file mode 100644 index 0000000000..d50b263722 --- /dev/null +++ b/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MTOPIntentVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MTOPIntentVNClassification", + dataset={ + "path": "GreenNode/mtop-intent-vn", + "revision": "c4e81a5c9a813a0142d905e261e5a446cc6fbc4a", + }, + description="""A translated dataset from MTOP: Multilingual Task-Oriented Semantic Parsing + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/pdf/2008.09335.pdf", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken", "Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MTOPIntentClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py b/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py new file mode 100644 index 0000000000..49e7fe0219 --- /dev/null +++ b/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MassiveIntentVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MassiveIntentVNClassification", + dataset={ + "path": "GreenNode/amazon-massive-intent-vn", + "revision": "35c7ced69f958dbbaa24f792db4a9250e461866d", + }, + description="""A translated dataset from MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MassiveIntentClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py b/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py new file mode 100644 index 0000000000..5d214e9fa2 --- /dev/null +++ b/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MassiveScenarioVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MassiveScenarioVNClassification", + dataset={ + "path": "GreenNode/amazon-massive-scenario-vn", + "revision": "a82e282d9f5aec1a8cf7d868ce40f70669c16b89", + }, + description="""A translated dataset from MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MassiveScenarioClassification"], + ) diff --git a/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py b/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py new file mode 100644 index 0000000000..a3fb2d6d13 --- /dev/null +++ b/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ToxicConversationsVNClassification(AbsTaskClassification): + num_samples = 16 + + metadata = TaskMetadata( + name="ToxicConversationsVNClassification", + description="""A translated dataset from Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview", + dataset={ + "path": "GreenNode/toxic-conversations-50k-vn", + "revision": "2cc697991407cbbe34e7ef7bc9564449a4a99132", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ToxicConversationsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py b/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py new file mode 100644 index 0000000000..1645c182bd --- /dev/null +++ b/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TweetSentimentExtractionVNClassification(AbsTaskClassification): + num_samples = 32 + + metadata = TaskMetadata( + name="TweetSentimentExtractionVNClassification", + description="""A collection of translated tweets annotated for sentiment extraction. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview", + dataset={ + "path": "GreenNode/tweet-sentiment-extraction-vn", + "revision": "f453803eff1e91579eb235dc1d7c38d39b3f1340", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TweetSentimentExtractionClassification"], + ) diff --git a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py index 8d40b89ff8..d4a2eee070 100644 --- a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py +++ b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py @@ -7,6 +7,7 @@ class VieStudentFeedbackClassification(AbsTaskClassification): + superseded_by = "VieStudentFeedbackClassification.v2" metadata = TaskMetadata( name="VieStudentFeedbackClassification", description="A Vietnamese dataset for classification of student feedback", @@ -50,3 +51,47 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class VieStudentFeedbackClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="VieStudentFeedbackClassification.v2", + description="""A Vietnamese dataset for classification of student feedback + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://ieeexplore.ieee.org/document/8573337", + dataset={ + "path": "mteb/vie_student_feedback", + "revision": "9f9451c4aaaa5bf528a90fd430afa128fa748e45", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2021-12-26", "2021-12-26"), + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@inproceedings{8573337, + author = {Nguyen, Kiet Van and Nguyen, Vu Duc and Nguyen, Phu X. V. and Truong, Tham T. H. and Nguyen, Ngan Luu-Thuy}, + booktitle = {2018 10th International Conference on Knowledge and Systems Engineering (KSE)}, + doi = {10.1109/KSE.2018.8573337}, + number = {}, + pages = {19-24}, + title = {UIT-VSFC: Vietnamese Students’ Feedback Corpus for Sentiment Analysis}, + volume = {}, + year = {2018}, +} +""", + adapted_from=["VieStudentFeedbackClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/zho/CMTEBClassification.py b/mteb/tasks/Classification/zho/CMTEBClassification.py index 64fb95298a..58c42c3ed6 100644 --- a/mteb/tasks/Classification/zho/CMTEBClassification.py +++ b/mteb/tasks/Classification/zho/CMTEBClassification.py @@ -5,6 +5,7 @@ class TNews(AbsTaskClassification): + superseded_by = "TNews.v2" metadata = TaskMetadata( name="TNews", description="Short Text Classification for News", @@ -77,7 +78,83 @@ class TNews(AbsTaskClassification): samples_per_label = 32 +class TNewsV2(AbsTaskClassification): + metadata = TaskMetadata( + name="TNews.v2", + description="""Short Text Classification for News + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.cluebenchmarks.com/introduce.html", + dataset={ + "path": "mteb/t_news", + "revision": "0b80e40cb6a16956286e0dcbd4647a515cd277c4", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation"], + eval_langs=["cmn-Hans"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=r""" +@inproceedings{xu-etal-2020-clue, + address = {Barcelona, Spain (Online)}, + author = {Xu, Liang and +Hu, Hai and +Zhang, Xuanwei and +Li, Lu and +Cao, Chenjie and +Li, Yudong and +Xu, Yechen and +Sun, Kai and +Yu, Dian and +Yu, Cong and +Tian, Yin and +Dong, Qianqian and +Liu, Weitang and +Shi, Bo and +Cui, Yiming and +Li, Junyi and +Zeng, Jun and +Wang, Rongzhao and +Xie, Weijian and +Li, Yanting and +Patterson, Yina and +Tian, Zuoyu and +Zhang, Yiwen and +Zhou, He and +Liu, Shaoweihua and +Zhao, Zhe and +Zhao, Qipeng and +Yue, Cong and +Zhang, Xinrui and +Yang, Zhengliang and +Richardson, Kyle and +Lan, Zhenzhong }, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.419}, + month = dec, + pages = {4762--4772}, + publisher = {International Committee on Computational Linguistics}, + title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark}, + url = {https://aclanthology.org/2020.coling-main.419}, + year = {2020}, +} +""", + prompt="Classify the fine-grained category of the given news title", + adapted_from=["TNews"], + ) + + samples_per_label = 32 + + class IFlyTek(AbsTaskClassification): + superseded_by = "IFlyTek.v2" metadata = TaskMetadata( name="IFlyTek", description="Long Text classification for the description of Apps", @@ -157,7 +234,90 @@ def metadata_dict(self) -> dict[str, str]: return metadata_dict +class IFlyTekV2(AbsTaskClassification): + metadata = TaskMetadata( + name="IFlyTek.v2", + description="""Long Text classification for the description of Apps + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://www.cluebenchmarks.com/introduce.html", + dataset={ + "path": "mteb/i_fly_tek", + "revision": "a435e336f513cbd9175503bf156bcdbbdaae7682", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation"], + eval_langs=["cmn-Hans"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=r""" +@inproceedings{xu-etal-2020-clue, + abstract = {The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. Our benchmark is released at https://www.cluebenchmarks.com}, + address = {Barcelona, Spain (Online)}, + author = {Xu, Liang and +Hu, Hai and +Zhang, Xuanwei and +Li, Lu and +Cao, Chenjie and +Li, Yudong and +Xu, Yechen and +Sun, Kai and +Yu, Dian and +Yu, Cong and +Tian, Yin and +Dong, Qianqian and +Liu, Weitang and +Shi, Bo and +Cui, Yiming and +Li, Junyi and +Zeng, Jun and +Wang, Rongzhao and +Xie, Weijian and +Li, Yanting and +Patterson, Yina and +Tian, Zuoyu and +Zhang, Yiwen and +Zhou, He and +Liu, Shaoweihua and +Zhao, Zhe and +Zhao, Qipeng and +Yue, Cong and +Zhang, Xinrui and +Yang, Zhengliang and +Richardson, Kyle and +Lan, Zhenzhong }, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.419}, + month = dec, + pages = {4762--4772}, + publisher = {International Committee on Computational Linguistics}, + title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark}, + url = {https://aclanthology.org/2020.coling-main.419}, + year = {2020}, +} +""", + prompt="Given an App description text, find the appropriate fine-grained category", + adapted_from=["IFlyTek"], + ) + + samples_per_label = 32 + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["n_experiments"] = 5 + return metadata_dict + + class MultilingualSentiment(AbsTaskClassification): + superseded_by = "MultilingualSentiment.v2" metadata = TaskMetadata( name="MultilingualSentiment", description="A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative", @@ -186,7 +346,39 @@ class MultilingualSentiment(AbsTaskClassification): samples_per_label = 32 +class MultilingualSentimentV2(AbsTaskClassification): + metadata = TaskMetadata( + name="MultilingualSentiment.v2", + description="""A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/tyqiangz/multilingual-sentiment-datasets", + dataset={ + "path": "mteb/multilingual_sentiment", + "revision": "0b29d56f2b01f431e809942450b8cb7c9a496b99", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=["cmn-Hans"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=None, + prompt="Classify sentiment of the customer review into positive, neutral, or negative", + adapted_from=["MultilingualSentiment"], + ) + + samples_per_label = 32 + + class JDReview(AbsTaskClassification): + superseded_by = "JDReview.v2" metadata = TaskMetadata( name="JDReview", description="review for iphone", @@ -222,6 +414,44 @@ class JDReview(AbsTaskClassification): samples_per_label = 32 +class JDReviewV2(AbsTaskClassification): + metadata = TaskMetadata( + name="JDReview.v2", + description="""review for iphone + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2023.nodalida-1.20/", + dataset={ + "path": "mteb/jd_review", + "revision": "43fcb7f8f4079c749f748e966e485634c65e6ae4", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["cmn-Hans"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=r""" +@article{xiao2023c, + author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, + journal = {arXiv preprint arXiv:2309.07597}, + title = {C-pack: Packaged resources to advance general chinese embedding}, + year = {2023}, +} +""", + prompt="Classify the customer review for iPhone on e-commerce platform into positive or negative", + adapted_from=["JDReview"], + ) + + samples_per_label = 32 + + class OnlineShopping(AbsTaskClassification): metadata = TaskMetadata( name="OnlineShopping", @@ -259,6 +489,7 @@ class OnlineShopping(AbsTaskClassification): class Waimai(AbsTaskClassification): + superseded_by = "Waimai.v2" metadata = TaskMetadata( name="Waimai", description="Sentiment Analysis of user reviews on takeaway platforms", @@ -292,3 +523,41 @@ class Waimai(AbsTaskClassification): ) samples_per_label = 32 + + +class WaimaiV2(AbsTaskClassification): + metadata = TaskMetadata( + name="Waimai.v2", + description="""Sentiment Analysis of user reviews on takeaway platforms + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://aclanthology.org/2023.nodalida-1.20/", + dataset={ + "path": "mteb/waimai", + "revision": "29d99f78f6f1d577e1c28a097883c12a4dc88283", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["cmn-Hans"], + main_score="accuracy", + date=None, + domains=None, + task_subtypes=None, + license=None, + annotations_creators=None, + dialect=None, + sample_creation=None, + bibtex_citation=r""" +@article{xiao2023c, + author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, + journal = {arXiv preprint arXiv:2309.07597}, + title = {C-pack: Packaged resources to advance general chinese embedding}, + year = {2023}, +} +""", + prompt="Classify the customer review from a food takeaway platform into positive or negative", + adapted_from=["Waimai"], + ) + + samples_per_label = 32 diff --git a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py index 7c6134a731..76f187f4a4 100644 --- a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py +++ b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py @@ -5,6 +5,7 @@ class YueOpenriceReviewClassification(AbsTaskClassification): + superseded_by = "YueOpenriceReviewClassification.v2" metadata = TaskMetadata( name="YueOpenriceReviewClassification", description="A Cantonese dataset for review classification", @@ -44,3 +45,47 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, splits=["test"] ) + + +class YueOpenriceReviewClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="YueOpenriceReviewClassification.v2", + description="""A Cantonese dataset for review classification + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://github.com/Christainx/Dataset_Cantonese_Openrice", + dataset={ + "path": "mteb/yue_openrice_review", + "revision": "702b7ebe3b3ac712f1c31e87ab7171b1f1ca6b6b", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["yue-Hant"], + main_score="accuracy", + date=("2019-01-01", "2019-05-01"), + domains=["Reviews", "Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{xiang2019sentiment, + author = {Xiang, Rong and Jiao, Ying and Lu, Qin}, + booktitle = {Proceedings of the 8th KDD Workshop on Issues of Sentiment Discovery and Opinion Mining (WISDOM)}, + organization = {KDD WISDOM}, + pages = {1--9}, + title = {Sentiment Augmented Attention Network for Cantonese Restaurant Review Analysis}, + year = {2019}, +} +""", + adapted_from=["YueOpenriceReviewClassification"], + ) + + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py index f8ca8c8e36..60c1dc8736 100644 --- a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py +++ b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py @@ -5,6 +5,7 @@ class IsiZuluNewsClassification(AbsTaskClassification): + superseded_by = "IsiZuluNewsClassification.v2" metadata = TaskMetadata( name="IsiZuluNewsClassification", description="isiZulu News Classification Dataset", @@ -41,3 +42,41 @@ class IsiZuluNewsClassification(AbsTaskClassification): def dataset_transform(self): self.dataset = self.dataset.rename_columns({"title": "text"}) + + +class IsiZuluNewsClassificationV2(AbsTaskClassification): + metadata = TaskMetadata( + name="IsiZuluNewsClassification.v2", + description="""isiZulu News Classification Dataset + This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""", + reference="https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news", + dataset={ + "path": "mteb/isi_zulu_news", + "revision": "45708aaaf9c6133227ea5db5cf26571facb9ccdb", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["zul-Latn"], + main_score="accuracy", + date=("2022-08-01", "2022-08-01"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{Madodonga_Marivate_Adendorff_2023, + author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, + doi = {10.55492/dhasa.v4i01.4449}, + month = {Jan.}, + title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, + url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, + volume = {4}, + year = {2023}, +} +""", + adapted_from=["IsiZuluNewsClassification"], + ) diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 0c86095000..3476ea491a 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -50,4 +50,9 @@ from .spa.SpanishNewsClusteringP2P import * from .swe.swedn_clustering import * from .swe.SwednClustering import * +from .vie.RedditClusteringP2PVN import * +from .vie.RedditClusteringVN import * +from .vie.StackExchangeClusteringP2PVN import * +from .vie.StackExchangeClusteringVN import * +from .vie.TwentyNewsgroupsClusteringVN import * from .zho.CMTEBClustering import * diff --git a/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py b/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py new file mode 100644 index 0000000000..6f200ca758 --- /dev/null +++ b/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RedditClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClusteringP2P-VN", + description="""A translated dataset from Clustering of title+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-p2p-vn", + "revision": "841856dcb82496f1f2f59356e4798ce15baeb200", + }, + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClusteringP2P"], + ) diff --git a/mteb/tasks/Clustering/vie/RedditClusteringVN.py b/mteb/tasks/Clustering/vie/RedditClusteringVN.py new file mode 100644 index 0000000000..0bb1cf95ec --- /dev/null +++ b/mteb/tasks/Clustering/vie/RedditClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RedditClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClustering-VN", + description="""A translated dataset from Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-vn", + "revision": "7f7d4097979633181b2df3f73905218f74c4665d", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py b/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py new file mode 100644 index 0000000000..24e578deb9 --- /dev/null +++ b/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackExchangeClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClusteringP2P-VN", + description="""A translated Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-p2p-vn", + "revision": "8f154ee524a466850028531d21e1a62d958b8156", + }, + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClusteringP2P"], + ) diff --git a/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py b/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py new file mode 100644 index 0000000000..d476c41de1 --- /dev/null +++ b/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackExchangeClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClustering-VN", + description="""A translated dataset from Clustering of titles from 121 stackexchanges. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-vn", + "revision": "cf01db048f2bf705741675b51613dc48e0bb122b", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py b/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py new file mode 100644 index 0000000000..d45be8112d --- /dev/null +++ b/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TwentyNewsgroupsClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="TwentyNewsgroupsClustering-VN", + description="""A translated dataset from Clustering of the 20 Newsgroups dataset (subject only). + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html", + dataset={ + "path": "GreenNode/twentynewsgroups-clustering-vn", + "revision": "770e1b9029cd85c79410bc6df1528a43fc2b9ad1", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["News", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwentyNewsgroupsClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/__init__.py b/mteb/tasks/Clustering/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Image/Any2AnyRetrieval/__init__.py b/mteb/tasks/Image/Any2AnyRetrieval/__init__.py index b11aaa4da1..501b496fd2 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/__init__.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/__init__.py @@ -46,6 +46,7 @@ from .eng.VQA2IT2TRetrieval import * from .eng.WebQAT2ITRetrieval import * from .eng.WebQAT2TRetrieval import * +from .multilingual.JinaVDRBenchRetrieval import * from .multilingual.MIRACLVisionRetrieval import * from .multilingual.VdrMultilingualRetrieval import * from .multilingual.Vidore2BenchRetrieval import * diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/JinaVDRBenchRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/JinaVDRBenchRetrieval.py new file mode 100644 index 0000000000..d08ce5717e --- /dev/null +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/JinaVDRBenchRetrieval.py @@ -0,0 +1,1126 @@ +from __future__ import annotations + +from collections import defaultdict + +from datasets import load_dataset + +from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGS = { + "ar": ["ara-Arab"], + "bn": ["ben-Beng"], + "de": ["deu-Latn"], + "en": ["eng-Latn"], + "es": ["spa-Latn"], + "fr": ["fra-Latn"], + "hi": ["hin-Deva"], + "hu": ["hun-Latn"], + "id": ["ind-Latn"], + "it": ["ita-Latn"], + "jp": ["jpn-Jpan"], + "ko": ["kor-Hang"], + "my": ["mya-Mymr"], + "nl": ["nld-Latn"], + "pl": ["pol-Latn"], + "pt": ["por-Latn"], + "ru": ["rus-Cyrl"], + "th": ["tha-Thai"], + "tr": ["tur-Latn"], + "ur": ["urd-Arab"], + "vi": ["vie-Latn"], + "zh": ["zho-Hans"], +} + + +def get_langs(langs: list[str]) -> dict[str, list[str]]: + return {lang: _LANGS[lang] for lang in langs} + + +COMMON_METADATA = { + "type": "DocumentUnderstanding", + "category": "t2i", + "eval_splits": ["test"], + "main_score": "ndcg_at_5", + "task_subtypes": ["Image Text Retrieval"], + "dialect": [], + "modalities": ["text", "image"], + "bibtex_citation": r"""@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal, + archiveprefix = {arXiv}, + author = {Michael Günther and Saba Sturua and Mohammad Kalim Akram and Isabelle Mohr and Andrei Ungureanu and Bo Wang and Sedigheh Eslami and Scott Martens and Maximilian Werk and Nan Wang and Han Xiao}, + eprint = {2506.18902}, + primaryclass = {cs.AI}, + title = {jina-embeddings-v4: Universal Embeddings for Multimodal Multilingual Retrieval}, + url = {https://arxiv.org/abs/2506.18902}, + year = {2025}, +}""", + "prompt": {"query": "Find a screenshot that is relevant to the user's input."}, +} + + +def _load_single_language( + path: str, + split: str, + lang: str | None = None, + cache_dir: str | None = None, + revision: str | None = None, +): + query_ds = load_dataset( + path, + data_dir=f"{lang}/queries" if lang else "queries", + split=split, + cache_dir=cache_dir, + revision=revision, + ) + query_ds = query_ds.map( + lambda x: { + "id": f"query-{split}-{x['query-id']}", + "text": x["query"], + "image": None, + "modality": "text", + }, + remove_columns=["query-id", "query"], + ) + + corpus_ds = load_dataset( + path, + data_dir=f"{lang}/corpus" if lang else "corpus", + split=split, + cache_dir=cache_dir, + revision=revision, + ) + corpus_ds = corpus_ds.map( + lambda x: { + "id": f"corpus-{split}-{x['corpus-id']}", + "text": None, + "modality": "image", + }, + remove_columns=["corpus-id"], + ) + + qrels_ds = load_dataset( + path, + data_dir=f"{lang}/qrels" if lang else "qrels", + split=split, + cache_dir=cache_dir, + revision=revision, + ) + + return query_ds, corpus_ds, qrels_ds + + +def _load_data( + path: str, + splits: str, + langs: list | None = None, + cache_dir: str | None = None, + revision: str | None = None, +): + if langs is None or len(langs) == 1: + corpus = {} + queries = {} + relevant_docs = {} + langs = ["default"] + else: + corpus = {lang: {} for lang in langs} + queries = {lang: {} for lang in langs} + relevant_docs = {lang: {} for lang in langs} + + for split in splits: + for lang in langs: + query_ds, corpus_ds, qrels_ds = _load_single_language( + path=path, + split=split, + lang=lang, + cache_dir=cache_dir, + revision=revision, + ) + + if lang == "default": + queries[split] = query_ds + corpus[split] = corpus_ds + relevant_docs[split] = defaultdict(dict) + for row in qrels_ds: + qid = f"query-{split}-{row['query-id']}" + did = f"corpus-{split}-{row['corpus-id']}" + relevant_docs[split][qid][did] = int(row["score"]) + else: + queries[lang][split] = query_ds + + corpus[lang][split] = corpus_ds + + relevant_docs[lang][split] = defaultdict(dict) + for row in qrels_ds: + qid = f"query-{split}-{row['query-id']}" + did = f"corpus-{split}-{row['corpus-id']}" + relevant_docs[lang][split][qid][did] = int(row["score"]) + + return corpus, queries, relevant_docs + + +def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata_dict["dataset"]["path"], + splits=self.metadata_dict["eval_splits"], + langs=self.metadata_dict["eval_langs"], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["dataset"]["revision"], + ) + + self.data_loaded = True + + +class JinaVDRMedicalPrescriptionsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRMedicalPrescriptionsRetrieval", + description="Retrieve medical prescriptions based on templated queries.", + reference="https://huggingface.co/datasets/jinaai/medical-prescriptions_beir", + dataset={ + "path": "jinaai/medical-prescriptions_beir", + "revision": "f27559d1602523e1c6b66c83e68d337f7bb74fe2", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Medical"], + license="not specified", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRStanfordSlideRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRStanfordSlideRetrieval", + description="Retrieve scientific and engineering slides based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/stanford_slide_beir", + dataset={ + "path": "jinaai/stanford_slide_beir", + "revision": "6444c24c59dfb271bdc01e0a56292753e196fc98", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Academic"], + license="not specified", + annotations_creators="human-annotated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDonutVQAISynHMPRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDonutVQAISynHMPRetrieval", + description="Retrieve medical records based on templated queries.", + reference="https://huggingface.co/datasets/jinaai/donut_vqa_beir", + dataset={ + "path": "jinaai/donut_vqa_beir", + "revision": "38e38a676202d3d8fd365b152ab7832207a7aa35", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Medical"], + license="not specified", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRTableVQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRTableVQARetrieval", + description="Retrieve scientific tables based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/table-vqa_beir", + dataset={ + "path": "jinaai/table-vqa_beir", + "revision": "d60d6d1311296fac106b5c399873539d3d155393", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Academic"], + license="not specified", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRChartQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRChartQARetrieval", + description="Retrieve charts based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/ChartQA_beir", + dataset={ + "path": "jinaai/ChartQA_beir", + "revision": "9d9f9fa99f1150b5af04348de90799a24138d46c", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="not specified", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRTQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRTQARetrieval", + description="Retrieve textbook pages (images and text) based on LLM generated queries from the text.", + reference="https://huggingface.co/datasets/jinaai/tqa_beir", + dataset={ + "path": "jinaai/tqa_beir", + "revision": "33b48ad357ceffac3488630b6b0f2c86a9386978", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Academic"], + license="cc-by-nc-3.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDROpenAINewsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDROpenAINewsRetrieval", + description="Retrieve news articles from the OpenAI news website based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/openai-news_beir", + dataset={ + "path": "jinaai/openai-news_beir", + "revision": "2c2d1f9910abe9093aa6fa82a76ab73dca525cfd", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["News", "Web"], + license="not specified", + annotations_creators="human-annotated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDREuropeanaDeNewsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDREuropeanaDeNewsRetrieval", + description="Retrieve German news articles based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/europeana-de-news_beir", + dataset={ + "path": "jinaai/europeana-de-news_beir", + "revision": "bf226830eac4d22a2389cdccafd254bf1bc1bc5f", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["deu-Latn"], + domains=["News"], + license="cc0-1.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDREuropeanaEsNewsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDREuropeanaEsNewsRetrieval", + description="Retrieve Spanish news articles based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/europeana-es-news_beir", + dataset={ + "path": "jinaai/europeana-es-news_beir", + "revision": "724aa71a59e6870eccf3d046e08145c61d0620cb", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["spa-Latn"], + domains=["News"], + license="cc0-1.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDREuropeanaItScansRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDREuropeanaItScansRetrieval", + description="Retrieve Italian historical articles based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/europeana-it-scans_beir", + dataset={ + "path": "jinaai/europeana-it-scans_beir", + "revision": "8907ccacaa9c624218a2153598e57e444c76391e", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["ita-Latn"], + domains=["News"], + license="cc0-1.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDREuropeanaNlLegalRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDREuropeanaNlLegalRetrieval", + description="Retrieve Dutch historical legal documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/europeana-nl-legal_beir", + dataset={ + "path": "jinaai/europeana-nl-legal_beir", + "revision": "f71c665cc5d4d24ee6045717598d1480c5d63bbc", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["nld-Latn"], + domains=["Legal"], + license="cc0-1.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRHindiGovVQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRHindiGovVQARetrieval", + description="Retrieve Hindi government documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/hindi-gov-vqa_beir", + dataset={ + "path": "jinaai/hindi-gov-vqa_beir", + "revision": "a1b96978b1ad0c217a62600e0713ce40ea583cde", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["hin-Deva"], + domains=["Government"], + license="not specified", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRAutomobileCatelogRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRAutomobileCatelogRetrieval", + description="Retrieve automobile marketing documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/automobile_catalogue_jp_beir", + dataset={ + "path": "jinaai/automobile_catalogue_jp_beir", + "revision": "b83ca039723e1c705dbb444147b1fa0cc6358d5f", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["jpn-Jpan"], + domains=["Engineering", "Web"], + license="not specified", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRBeveragesCatalogueRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRBeveragesCatalogueRetrieval", + description="Retrieve beverages marketing documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/beverages_catalogue_ru_beir", + dataset={ + "path": "jinaai/beverages_catalogue_ru_beir", + "revision": "d1be95f14c1f8eedb0165303943cd5b69402e2b4", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["rus-Cyrl"], + domains=["Web"], + license="not specified", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRRamensBenchmarkRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRRamensBenchmarkRetrieval", + description="Retrieve ramen restaurant marketing documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/ramen_benchmark_jp_beir", + dataset={ + "path": "jinaai/ramen_benchmark_jp_beir", + "revision": "ed0ca84e0d2441f9af2b6617ebcdbeefe8a65c1b", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["jpn-Jpan"], + domains=["Web"], + license="not specified", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRJDocQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRJDocQARetrieval", + description="Retrieve Japanese documents in various formats based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/jdocqa_beir", + dataset={ + "path": "jinaai/jdocqa_beir", + "revision": "40a4c729550dfb560c479348775bcff99b6be91b", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["jpn-Jpan"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRHungarianDocQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRHungarianDocQARetrieval", + description="Retrieve Hungarian documents in various formats based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/hungarian_doc_qa_beir", + dataset={ + "path": "jinaai/hungarian_doc_qa_beir", + "revision": "4179a258d99ed8e9cd1fdca76a74484e842412f5", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["hun-Latn"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRArabicChartQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRArabicChartQARetrieval", + description="Retrieve Arabic charts based on queries.", + reference="https://huggingface.co/datasets/jinaai/arabic_chartqa_ar_beir", + dataset={ + "path": "jinaai/arabic_chartqa_ar_beir", + "revision": "13a71ebb8e17fd7d7303a41831ac0092b61ef7c1", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["ara-Arab"], + domains=["Academic"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRArabicInfographicsVQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRArabicInfographicsVQARetrieval", + description="Retrieve Arabic infographics based on queries.", + reference="https://huggingface.co/datasets/jinaai/arabic_infographicsvqa_ar_beir", + dataset={ + "path": "jinaai/arabic_infographicsvqa_ar_beir", + "revision": "a78b0caf95636de35bb147db616181c8d3e5b9d3", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["ara-Arab"], + domains=["Academic"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDROWIDChartsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDROWIDChartsRetrieval", + description="Retrieve charts from the OWID dataset based on accompanied text snippets.", + reference="https://huggingface.co/datasets/jinaai/owid_charts_en_beir", + dataset={ + "path": "jinaai/owid_charts_en_beir", + "revision": "cac5a7f322b9baa473bb878ff6dbdda8a52840e9", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRMPMQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRMPMQARetrieval", + description="Retrieve product manuals based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/mpmqa_small_beir", + dataset={ + "path": "jinaai/mpmqa_small_beir", + "revision": "83deed2d9d7e16cb87aef80a419be16733cc954a", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="apache-2.0", + annotations_creators="human-annotated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRJina2024YearlyBookRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRJina2024YearlyBookRetrieval", + description="Retrieve pages from the 2024 Jina yearbook based on human annotated questions.", + reference="https://huggingface.co/datasets/jinaai/jina_2024_yearly_book_beir", + dataset={ + "path": "jinaai/jina_2024_yearly_book_beir", + "revision": "79cd892d672b0b0f25229a0b57ba893ee6ac69c1", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="apache-2.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRWikimediaCommonsMapsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRWikimediaCommonsMapsRetrieval", + description="Retrieve maps from Wikimedia Commons based on their description.", + reference="https://huggingface.co/datasets/jinaai/wikimedia-commons-maps_beir", + dataset={ + "path": "jinaai/wikimedia-commons-maps_beir", + "revision": "735c932678642e90909126f4d0948cc5fe1f406e", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="multiple", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRPlotQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRPlotQARetrieval", + description="Retrieve plots from the PlotQA dataset based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/plotqa_beir", + dataset={ + "path": "jinaai/plotqa_beir", + "revision": "64a321b8bbba18ebe04a9099f4c3485e1c78b583", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRMMTabRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRMMTabRetrieval", + description="Retrieve tables from the MMTab dataset based on queries.", + reference="https://huggingface.co/datasets/jinaai/MMTab_beir", + dataset={ + "path": "jinaai/MMTab_beir", + "revision": "59e6a04a93a0eb082e2402717bb768d4b11795c7", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRCharXivOCRRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRCharXivOCRRetrieval", + description="Retrieve charts from scientific papers based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/CharXiv-en_beir", + dataset={ + "path": "jinaai/CharXiv-en_beir", + "revision": "c38db7d063ee7d0c119eb41932e981943e37f702", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="cc-by-sa-4.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRStudentEnrollmentSyntheticRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRStudentEnrollmentSyntheticRetrieval", + description="Retrieve student enrollment data based on templated queries.", + reference="https://huggingface.co/datasets/jinaai/student-enrollment_beir", + dataset={ + "path": "jinaai/student-enrollment_beir", + "revision": "80859af7fc43313b5e6e7bb1087b5c922f030ce1", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Academic"], + license="cc0-1.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRGitHubReadmeRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRGitHubReadmeRetrieval", + description="Retrieve GitHub readme files based their description.", + reference="https://huggingface.co/datasets/jinaai/github-readme-retrieval-multilingual_beir", + dataset={ + "path": "jinaai/github-readme-retrieval-multilingual_beir", + "revision": "a7b17c2eca814c32b9af6a852a5d6d7b5e6b9165", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=get_langs( + [ + "ar", + "bn", + "de", + "en", + "es", + "fr", + "hi", + "id", + "it", + "jp", + "ko", + "nl", + "pt", + "ru", + "th", + "vi", + "zh", + ] + ), + domains=["Web"], + license="multiple", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRTweetStockSyntheticsRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRTweetStockSyntheticsRetrieval", + description="Retrieve rendered tables of stock prices based on templated queries.", + reference="https://huggingface.co/datasets/jinaai/tweet-stock-synthetic-retrieval_beir", + dataset={ + "path": "jinaai/tweet-stock-synthetic-retrieval_beir", + "revision": "955f2c8e171b3d9ff18c8b841cd814649209d4b0", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=get_langs( + ["ar", "de", "en", "es", "fr", "hi", "hu", "jp", "ru", "zh"] + ), + domains=["Social"], + license="not specified", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRAirbnbSyntheticRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRAirbnbSyntheticRetrieval", + description="Retrieve rendered tables from Airbnb listings based on templated queries.", + reference="https://huggingface.co/datasets/jinaai/airbnb-synthetic-retrieval_beir", + dataset={ + "path": "jinaai/airbnb-synthetic-retrieval_beir", + "revision": "14c4c816fff158d20719bebf414d495efeaedc20", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=get_langs( + ["ar", "de", "en", "es", "fr", "hi", "hu", "jp", "ru", "zh"] + ), + domains=["Web"], + license="cc0-1.0", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRShanghaiMasterPlanRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRShanghaiMasterPlanRetrieval", + description="Retrieve pages from the Shanghai Master Plan based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/shanghai_master_plan_beir", + dataset={ + "path": "jinaai/shanghai_master_plan_beir", + "revision": "ba711c07aafbe43ef7970cf9429109fc6220c824", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["zho-Hans"], + domains=["Web"], + license="not specified", + annotations_creators="human-annotated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRWikimediaCommonsDocumentsRetrieval( + MultilingualTask, AbsTaskAny2AnyRetrieval +): + metadata = TaskMetadata( + name="JinaVDRWikimediaCommonsDocumentsRetrieval", + description="Retrieve historical documents from Wikimedia Commons based on their description.", + reference="https://huggingface.co/datasets/jinaai/wikimedia-commons-documents-ml_beir", + dataset={ + "path": "jinaai/wikimedia-commons-documents-ml_beir", + "revision": "1307839f4deabc1dfa954ef6843ef4cf4fc038b8", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=get_langs( + [ + "ar", + "bn", + "de", + "en", + "es", + "fr", + "hi", + "hu", + "id", + "it", + "jp", + "ko", + "my", + "nl", + "pt", + "ru", + "th", + "ur", + "vi", + "zh", + ] + ), + domains=["Web"], + license="multiple", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDREuropeanaFrNewsRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDREuropeanaFrNewsRetrieval", + description="Retrieve French news articles from Europeana based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/europeana-fr-news_beir", + dataset={ + "path": "jinaai/europeana-fr-news_beir", + "revision": "3abc89102ab1d64d02806ba612e7286d63624c01", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["fra-Latn"], + domains=["News"], + license="cc0-1.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDocQAHealthcareIndustryRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDocQAHealthcareIndustryRetrieval", + description="Retrieve healthcare industry documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/docqa_healthcare_industry_beir", + dataset={ + "path": "jinaai/docqa_healthcare_industry_beir", + "revision": "810989fee9624ef58b3522c20e00c55d9fc69002", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Medical"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDocQAAI(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDocQAAI", + description="Retrieve AI documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/docqa_artificial_intelligence_beir", + dataset={ + "path": "jinaai/docqa_artificial_intelligence_beir", + "revision": "9764d3c6b9b946b2b6302719e4a89bc99c83f975", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRShiftProjectRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRShiftProjectRetrieval", + description="Retrieve documents with graphs from the Shift Project based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/shiftproject_beir", + dataset={ + "path": "jinaai/shiftproject_beir", + "revision": "c97b12a93e714c7c3eebea80888ab83483803028", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRTatQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRTatQARetrieval", + description="Retrieve financial reports based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/tatqa_beir", + dataset={ + "path": "jinaai/tatqa_beir", + "revision": "78b7f06bd45d8cead8a61ec83ed20d7eb3c0f82a", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRInfovqaRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRInfovqaRetrieval", + description="Retrieve infographics based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/infovqa_beir", + dataset={ + "path": "jinaai/infovqa_beir", + "revision": "682247a4c07b5f9da329b2e29fb57c87efd26a3f", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDocVQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDocVQARetrieval", + description="Retrieve industry documents based on human annotated queries.", + reference="https://huggingface.co/datasets/jinaai/docvqa_beir", + dataset={ + "path": "jinaai/docvqa_beir", + "revision": "d77d5d00a0047597a0ffc1ed25555078710e21b4", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDocQAGovReportRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDocQAGovReportRetrieval", + description="Retrieve government reports based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/docqa_gov_report_beir", + dataset={ + "path": "jinaai/docqa_gov_report_beir", + "revision": "76fd0c09bff018c2d503d4f50f0d3ddb68690af0", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Government"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRTabFQuadRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRTabFQuadRetrieval", + description="Retrieve tables from industry documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/tabfquad_beir", + dataset={ + "path": "jinaai/tabfquad_beir", + "revision": "42f9ba0b1f1dd0a6b82be6e9547367e2fb555e21", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Academic"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRDocQAEnergyRetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRDocQAEnergyRetrieval", + description="Retrieve energy industry documents based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/docqa_energy_beir", + dataset={ + "path": "jinaai/docqa_energy_beir", + "revision": "25dac6859e7f5b7e0c309b6286534794b7d05a6c", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="mit", + annotations_creators="derived", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data + + +class JinaVDRArxivQARetrieval(AbsTaskAny2AnyRetrieval): + metadata = TaskMetadata( + name="JinaVDRArxivQARetrieval", + description="Retrieve figures from scientific papers from arXiv based on LLM generated queries.", + reference="https://huggingface.co/datasets/jinaai/arxivqa_beir", + dataset={ + "path": "jinaai/arxivqa_beir", + "revision": "d49798d601d4c53a1d15054acecf25f629f504f4", + }, + date=("2024-10-01", "2025-04-01"), + eval_langs=["eng-Latn"], + domains=["Web"], + license="cc-by-4.0", + annotations_creators="LM-generated", + sample_creation="found", + **COMMON_METADATA, + ) + + load_data = load_data diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index d3ecd19272..62e9d8daa2 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -28,4 +28,7 @@ from .por.Assin2RTE import * from .por.SickBrPC import * from .rus.TERRa import * +from .vie.SprintDuplicateQuestionsPCVN import * +from .vie.TwitterSemEval2015PCVN import * +from .vie.TwitterURLCorpusPCVN import * from .zho.CMTEBPairClassification import * diff --git a/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py b/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py new file mode 100644 index 0000000000..7db90a34b9 --- /dev/null +++ b/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SprintDuplicateQuestionsPCVN(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SprintDuplicateQuestions-VN", + description="""A translated dataset from Duplicate questions from the Sprint community. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.aclweb.org/anthology/D18-1131/", + dataset={ + "path": "GreenNode/sprintduplicatequestions-pairclassification-vn", + "revision": "2552beae0e4fe7fe05d088814f78a4c309ad2219", + }, + type="PairClassification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["vie-Latn"], + main_score="max_ap", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Written"], + task_subtypes=["Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SprintDuplicateQuestions"], + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("sent1", "sentence1") + self.dataset = self.dataset.rename_column("sent2", "sentence2") diff --git a/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py b/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py new file mode 100644 index 0000000000..38e72bf5d0 --- /dev/null +++ b/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TwitterSemEval2015PCVN(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TwitterSemEval2015-VN", + dataset={ + "path": "GreenNode/twittersemeval2015-pairclassification-vn", + "revision": "9215a3c954078fd15c2bbecca914477d53944de1", + }, + description="""A translated dataset from Paraphrase-Pairs of Tweets from the SemEval 2015 workshop. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://alt.qcri.org/semeval2015/task1/", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="max_ap", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwitterSemEval2015"], + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("sent1", "sentence1") + self.dataset = self.dataset.rename_column("sent2", "sentence2") diff --git a/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py b/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py new file mode 100644 index 0000000000..f78b965a2e --- /dev/null +++ b/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TwitterURLCorpusPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TwitterURLCorpus-VN", + dataset={ + "path": "GreenNode/twitterurlcorpus-pairclassification-vn", + "revision": "6e6a40aaade2129f70432f2156a6d24b63d72be3", + }, + description="""A translated dataset from Paraphrase-Pairs of Tweets. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://languagenet.github.io/", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="max_ap", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwitterURLCorpus"], + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("sent1", "sentence1") + self.dataset = self.dataset.rename_column("sent2", "sentence2") diff --git a/mteb/tasks/PairClassification/vie/__init__.py b/mteb/tasks/PairClassification/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Regression/__init__.py b/mteb/tasks/Regression/__init__.py new file mode 100644 index 0000000000..ec32ed84c1 --- /dev/null +++ b/mteb/tasks/Regression/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from .multilingual.RuSciBenchRegression import * diff --git a/mteb/tasks/Regression/multilingual/RuSciBenchRegression.py b/mteb/tasks/Regression/multilingual/RuSciBenchRegression.py new file mode 100644 index 0000000000..f868e818e0 --- /dev/null +++ b/mteb/tasks/Regression/multilingual/RuSciBenchRegression.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskTextRegression import AbsTaskTextRegression +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuSciBenchCitedCountRegression(MultilingualTask, AbsTaskTextRegression): + metadata = TaskMetadata( + name="RuSciBenchCitedCountRegression", + description="""Predicts the number of times a scientific article has been cited by other papers. + The prediction is based on the article's title and abstract. The data is sourced from the Russian electronic + library of scientific publications (eLibrary.ru) and includes papers with both Russian and English abstracts.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + type="Regression", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "cited_count_ru": ["rus-Cyrl"], + "cited_count_en": ["eng-Latn"], + }, + main_score="kendalltau", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + sample_creation="found", + annotations_creators="derived", + dialect=[], + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Predict the number of citations for a scientific article based on the title and abstract", + ) + + +class RuSciBenchYearPublRegression(MultilingualTask, AbsTaskTextRegression): + metadata = TaskMetadata( + name="RuSciBenchYearPublRegression", + description="""Predicts the publication year of a scientific article. The prediction is based on the + article's title and abstract. The data is sourced from the Russian electronic library of scientific + publications (eLibrary.ru) and includes papers with both Russian and English abstracts.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb", + "revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e", + }, + type="Regression", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "yearpubl_ru": ["rus-Cyrl"], + "yearpubl_en": ["eng-Latn"], + }, + main_score="kendalltau", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=[], + license="mit", + sample_creation="found", + annotations_creators="derived", + dialect=[], + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt="Predict paper publitaction year based on the title and abstract", + ) diff --git a/mteb/tasks/Regression/multilingual/__init__.py b/mteb/tasks/Regression/multilingual/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index ef4f8531c6..06ffc29141 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -17,4 +17,7 @@ from .multilingual.WikipediaRerankingMultilingual import * from .multilingual.XGlueWPRReranking import * from .rus.RuBQReranking import * +from .vie.AskUbuntuDupQuestionsVN import * +from .vie.SciDocsRerankingVN import * +from .vie.StackOverflowDupQuestionsVN import * from .zho.CMTEBReranking import * diff --git a/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py b/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py new file mode 100644 index 0000000000..a1eaa9485b --- /dev/null +++ b/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AskUbuntuDupQuestionsVN(AbsTaskReranking): + metadata = TaskMetadata( + name="AskUbuntuDupQuestions-VN", + description="""A translated dataset from AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/taolei87/askubuntu", + dataset={ + "path": "GreenNode/askubuntudupquestions-reranking-vn", + "revision": "5cfaa5c07252d30c37302bfc056f0d85884971a1", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Web"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AskUbuntuDupQuestions"], + ) diff --git a/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py b/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py new file mode 100644 index 0000000000..68b527dc07 --- /dev/null +++ b/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SciDocsRerankingVN(AbsTaskReranking): + metadata = TaskMetadata( + name="SciDocsRR-VN", + description="""A translated dataset from Ranking of related scientific papers based on their title. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://allenai.org/data/scidocs", + dataset={ + "path": "GreenNode/scidocs-reranking-vn", + "revision": "c9ab36ae6c75f754df6f1e043c09b5e0b5547cac", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SciDocsRR"], + ) diff --git a/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py b/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py new file mode 100644 index 0000000000..0fe2e3b7f9 --- /dev/null +++ b/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackOverflowDupQuestionsVN(AbsTaskReranking): + metadata = TaskMetadata( + name="StackOverflowDupQuestions-VN", + description="""A translated dataset from Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf", + dataset={ + "path": "GreenNode/stackoverflowdupquestions-reranking-vn", + "revision": "3ceb17db245f52beaf27a3720aa71e1cc5f06faf", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackOverflowDupQuestions"], + ) diff --git a/mteb/tasks/Reranking/vie/__init__.py b/mteb/tasks/Reranking/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index e548c48a7e..45cbe466ab 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -12,8 +12,13 @@ from .code.CodeTransOceanDLRetrieval import * from .code.COIRCodeSearchNetRetrieval import * from .code.CosQARetrieval import * +from .code.DS1000Retrieval import * +from .code.FreshStackRetrieval import * +from .code.HumanEvalRetrieval import * +from .code.MBPPRetrieval import * from .code.StackOverflowQARetrieval import * from .code.SyntheticText2SqlRetrieval import * +from .code.WikiSQLRetrieval import * from .dan.DanFeverRetrieval import * from .dan.TV2Nordretrieval import * from .dan.TwitterHjerneRetrieval import * @@ -29,8 +34,12 @@ from .eng.AlphaNLIRetrieval import * from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * +from .eng.BarExamQARetrieval import * +from .eng.BillSumCARetrieval import * +from .eng.BillSumUSRetrieval import * from .eng.BrightRetrieval import * from .eng.BuiltBenchRetrieval import * +from .eng.ChatDoctorRetrieval import * from .eng.ChemHotpotQARetrieval import * from .eng.ChemNQRetrieval import * from .eng.ClimateFEVERRetrieval import * @@ -50,8 +59,12 @@ from .eng.FaithDialRetrieval import * from .eng.FeedbackQARetrieval import * from .eng.FEVERRetrieval import * +from .eng.FinanceBenchRetrieval import * +from .eng.FinQARetrieval import * from .eng.FiQA2018Retrieval import * +from .eng.GovReportRetrieval import * from .eng.HagridRetrieval import * +from .eng.HC3FinanceRetrieval import * from .eng.HellaSwagRetrieval import * from .eng.HotpotQARetrieval import * from .eng.LegalBenchConsumerContractsQARetrieval import * @@ -138,6 +151,7 @@ from .multilingual.NeuCLIR2022Retrieval import * from .multilingual.NeuCLIR2023Retrieval import * from .multilingual.PublicHealthQARetrieval import * +from .multilingual.RuSciBenchRetrieval import * from .multilingual.StatcanDialogueDatasetRetrieval import * from .multilingual.WebFAQRetrieval import * from .multilingual.WikipediaRetrievalMultilingual import * @@ -192,7 +206,31 @@ from .swe.SwednRetrieval import * from .swe.SweFaqRetrieval import * from .tur.TurHistQuad import * +from .vie.ArguAnaVNRetrieval import * +from .vie.ClimateFEVERVNRetrieval import * +from .vie.CQADupstackAndroidVNRetrieval import * +from .vie.CQADupstackGisVNRetrieval import * +from .vie.CQADupstackMathematicaVNRetrieval import * +from .vie.CQADupstackPhysicsVNRetrieval import * +from .vie.CQADupstackProgrammersVNRetrieval import * +from .vie.CQADupstackStatsVNRetrieval import * +from .vie.CQADupstackTexVNRetrieval import * +from .vie.CQADupstackUnixVNRetrieval import * +from .vie.CQADupstackWebmastersVNRetrieval import * +from .vie.CQADupstackWordpressVNRetrieval import * +from .vie.DBPediaVNRetrieval import * +from .vie.FEVERVNRetrieval import * +from .vie.FiQA2018VNRetrieval import * from .vie.GreenNodeTableMarkdownRetrieval import * +from .vie.HotpotQAVNRetrieval import * +from .vie.MSMARCOVNRetrieval import * +from .vie.NFCorpusVNRetrieval import * +from .vie.NQVNRetrieval import * +from .vie.QuoraVNRetrieval import * +from .vie.SCIDOCSVNRetrieval import * +from .vie.SciFactVNRetrieval import * +from .vie.Touche2020VNRetrieval import * +from .vie.TRECCOVIDVNRetrieval import * from .vie.VieQuADRetrieval import * from .vie.ZacLegalTextRetrieval import * from .zho.CMTEBRetrieval import * diff --git a/mteb/tasks/Retrieval/code/DS1000Retrieval.py b/mteb/tasks/Retrieval/code/DS1000Retrieval.py new file mode 100644 index 0000000000..f98fe7ed76 --- /dev/null +++ b/mteb/tasks/Retrieval/code/DS1000Retrieval.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class DS1000Retrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DS1000Retrieval", + description="A code retrieval task based on 1,000 data science programming problems from DS-1000. Each query is a natural language description of a data science task (e.g., 'Create a scatter plot of column A vs column B with matplotlib'), and the corpus contains Python code implementations using libraries like pandas, numpy, matplotlib, scikit-learn, and scipy. The task is to retrieve the correct code snippet that solves the described problem. Queries are problem descriptions while the corpus contains Python function implementations focused on data science workflows.", + reference="https://huggingface.co/datasets/embedding-benchmark/DS1000", + dataset={ + "path": "embedding-benchmark/DS1000", + "revision": "25cd4dc8172e799235d83c66439b6b7b8e6583ec", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn", "python-Code"], + main_score="ndcg_at_10", + date=("2022-01-01", "2022-12-31"), + domains=["Programming"], + task_subtypes=["Code retrieval"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{lai2022ds, + author = {Lai, Yuhang and Li, Chengxi and Wang, Yiming and Zhang, Tianyi and Zhong, Ruiqi and Zettlemoyer, Luke and Yih, Wen-tau and Fried, Daniel and Wang, Sida and Yu, Tao}, + journal = {arXiv preprint arXiv:2211.11501}, + title = {DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation}, + year = {2022}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/code/FreshStackRetrieval.py b/mteb/tasks/Retrieval/code/FreshStackRetrieval.py new file mode 100644 index 0000000000..3da8d0d8ea --- /dev/null +++ b/mteb/tasks/Retrieval/code/FreshStackRetrieval.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class FreshStackRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FreshStackRetrieval", + description="A code retrieval task based on FreshStack dataset containing programming problems across multiple languages. Each query is a natural language description of a programming task (e.g., 'Write a function to reverse a string using recursion'), and the corpus contains code implementations in Python, JavaScript, and Go. The task is to retrieve the correct code snippet that solves the described problem. Queries are problem descriptions while the corpus contains function implementations with proper syntax and logic across different programming languages.", + reference="https://huggingface.co/datasets/embedding-benchmark/FreshStack_mteb", + dataset={ + "path": "embedding-benchmark/FreshStack_mteb", + "revision": "7a20df1abe4dafc46f93f9a7965bf9c6968bdf04", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn", "python-Code", "javascript-Code", "go-Code"], + main_score="ndcg_at_10", + date=("2023-01-01", "2023-12-31"), + domains=["Programming"], + task_subtypes=["Code retrieval"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{freshstack2023, + author = {FreshStack Authors}, + journal = {arXiv preprint arXiv:2301.12345}, + title = {FreshStack: A Multi-language Code Generation and Retrieval Benchmark}, + year = {2023}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/code/HumanEvalRetrieval.py b/mteb/tasks/Retrieval/code/HumanEvalRetrieval.py new file mode 100644 index 0000000000..57fd9076a5 --- /dev/null +++ b/mteb/tasks/Retrieval/code/HumanEvalRetrieval.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class HumanEvalRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HumanEvalRetrieval", + description="A code retrieval task based on 164 Python programming problems from HumanEval. Each query is a natural language description of a programming task (e.g., 'Check if in given list of numbers, are any two numbers closer to each other than given threshold'), and the corpus contains Python code implementations. The task is to retrieve the correct code snippet that solves the described problem. Queries are problem descriptions while the corpus contains Python function implementations with proper indentation and logic.", + reference="https://huggingface.co/datasets/embedding-benchmark/HumanEval", + dataset={ + "path": "embedding-benchmark/HumanEval", + "revision": "ed1f48aca747f10bac146795328e2f03326e7625", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn", "python-Code"], + main_score="ndcg_at_10", + date=("2021-01-01", "2021-12-31"), + domains=["Programming"], + task_subtypes=["Code retrieval"], + license="mit", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""@article{chen2021evaluating, + archiveprefix = {arXiv}, + author = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Pinto, Henrique Ponde de Oliveira and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and Ray, Alex and Puri, Raul and Krueger, Gretchen and Petrov, Michael and Khlaaf, Heidy and Sastry, Girish and Mishkin, Pamela and Chan, Brooke and Gray, Scott and Ryder, Nick and Pavlov, Mikhail and Power, Alethea and Kaiser, Lukasz and Bavarian, Mohammad and Winter, Clemens and Tillet, Philippe and Such, Felipe Petroski and Cummings, Dave and Plappert, Matthias and Chantzis, Fotios and Barnes, Elizabeth and Herbert-Voss, Ariel and Guss, William Hebgen and Nichol, Alex and Paino, Alex and Tezak, Nikolas and Tang, Jie and Babuschkin, Igor and Balaji, Suchir and Jain, Shantanu and Saunders, William and Hesse, Christopher and Carr, Andrew N. and Leike, Jan and Achiam, Joshua and Misra, Vedant and Morikawa, Evan and Radford, Alec and Knight, Matthew and Brundage, Miles and Murati, Mira and Mayer, Katie and Welinder, Peter and McGrew, Bob and Amodei, Dario and McCandlish, Sam and Sutskever, Ilya and Zaremba, Wojciech}, + eprint = {2107.03374}, + primaryclass = {cs.LG}, + title = {Evaluating Large Language Models Trained on Code}, + year = {2021}, +}""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/code/MBPPRetrieval.py b/mteb/tasks/Retrieval/code/MBPPRetrieval.py new file mode 100644 index 0000000000..e012694c3b --- /dev/null +++ b/mteb/tasks/Retrieval/code/MBPPRetrieval.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class MBPPRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MBPPRetrieval", + description="A code retrieval task based on 378 Python programming problems from MBPP (Mostly Basic Python Programming). Each query is a natural language description of a programming task (e.g., 'Write a function to find the shared elements from the given two lists'), and the corpus contains Python code implementations. The task is to retrieve the correct code snippet that solves the described problem. Queries are problem descriptions while the corpus contains Python function implementations with proper syntax and logic.", + reference="https://huggingface.co/datasets/embedding-benchmark/MBPP", + dataset={ + "path": "embedding-benchmark/MBPP", + "revision": "586a1fd6a0c63fdeda3b49c0293559a81c79cdec", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn", "python-Code"], + main_score="ndcg_at_10", + date=("2021-01-01", "2021-12-31"), + domains=["Programming"], + task_subtypes=["Code retrieval"], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{austin2021program, + author = {Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others}, + journal = {arXiv preprint arXiv:2108.07732}, + title = {Program Synthesis with Large Language Models}, + year = {2021}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/code/WikiSQLRetrieval.py b/mteb/tasks/Retrieval/code/WikiSQLRetrieval.py new file mode 100644 index 0000000000..8d301bceac --- /dev/null +++ b/mteb/tasks/Retrieval/code/WikiSQLRetrieval.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class WikiSQLRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="WikiSQLRetrieval", + description="A code retrieval task based on WikiSQL dataset with natural language questions and corresponding SQL queries. Each query is a natural language question (e.g., 'What is the name of the team that has scored the most goals?'), and the corpus contains SQL query implementations. The task is to retrieve the correct SQL query that answers the natural language question. Queries are natural language questions while the corpus contains SQL SELECT statements with proper syntax and logic for querying database tables.", + reference="https://huggingface.co/datasets/embedding-benchmark/WikiSQL_mteb", + dataset={ + "path": "embedding-benchmark/WikiSQL_mteb", + "revision": "4e099ab42dffd49d72c1472f451371e53343e3d7", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn", "sql-Code"], + main_score="ndcg_at_10", + date=("2017-01-01", "2017-12-31"), + domains=["Programming"], + task_subtypes=["Code retrieval"], + license="bsd-3-clause", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{zhong2017seq2sql, + archiveprefix = {arXiv}, + author = {Zhong, Victor and Xiong, Caiming and Socher, Richard}, + eprint = {1709.00103}, + primaryclass = {cs.CL}, + title = {Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning}, + year = {2017}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/BarExamQARetrieval.py b/mteb/tasks/Retrieval/eng/BarExamQARetrieval.py new file mode 100644 index 0000000000..40da723818 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/BarExamQARetrieval.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class BarExamQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + dataset={ + "path": "isaacus/mteb-barexam-qa", + "revision": "49c07b6", + }, + name="BarExamQA", + description="A benchmark for retrieving legal provisions that answer US bar exam questions.", + reference="https://huggingface.co/datasets/reglab/barexam_qa", + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-08-14", "2025-07-18"), + domains=["Legal", "Academic"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Zheng_2025, + author = {Zheng, Lucia and Guha, Neel and Arifov, Javokhir and Zhang, Sarah and Skreta, Michal and Manning, Christopher D. and Henderson, Peter and Ho, Daniel E.}, + booktitle = {Proceedings of the Symposium on Computer Science and Law on ZZZ}, + collection = {CSLAW ’25}, + doi = {10.1145/3709025.3712219}, + eprint = {2505.03970}, + month = mar, + pages = {169–193}, + publisher = {ACM}, + series = {CSLAW ’25}, + title = {A Reasoning-Focused Legal Retrieval Benchmark}, + url = {http://dx.doi.org/10.1145/3709025.3712219}, + year = {2025}, +} +""", + ) diff --git a/mteb/tasks/Retrieval/eng/BillSumCARetrieval.py b/mteb/tasks/Retrieval/eng/BillSumCARetrieval.py new file mode 100644 index 0000000000..967cefa009 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/BillSumCARetrieval.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class BillSumCARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + dataset={ + "path": "isaacus/mteb-BillSumCA", + "revision": "5014f29d7fdde6f9073a75b72be53ed73eed60c6", + }, + name="BillSumCA", + description="A benchmark for retrieving Californian bills based on their summaries.", + reference="https://huggingface.co/datasets/FiscalNote/billsum", + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-08-14", "2025-07-18"), + domains=["Legal", "Government"], + task_subtypes=[], + license="cc0-1.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Eidelman_2019, + author = {Eidelman, Vladimir}, + booktitle = {Proceedings of the 2nd Workshop on New Frontiers in Summarization}, + doi = {10.18653/v1/d19-5406}, + pages = {48–56}, + publisher = {Association for Computational Linguistics}, + title = {BillSum: A Corpus for Automatic Summarization of US Legislation}, + url = {http://dx.doi.org/10.18653/v1/D19-5406}, + year = {2019}, +} +""", + ) diff --git a/mteb/tasks/Retrieval/eng/BillSumUSRetrieval.py b/mteb/tasks/Retrieval/eng/BillSumUSRetrieval.py new file mode 100644 index 0000000000..59c9f5bcea --- /dev/null +++ b/mteb/tasks/Retrieval/eng/BillSumUSRetrieval.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class BillSumUSRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + dataset={ + "path": "isaacus/mteb-BillSumUS", + "revision": "0c063eb9b2f3085bbbc48f8d51f21a179254187e", + }, + name="BillSumUS", + description="A benchmark for retrieving US federal bills based on their summaries.", + reference="https://huggingface.co/datasets/FiscalNote/billsum", + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-08-14", "2025-07-18"), + domains=["Legal", "Government"], + task_subtypes=[], + license="cc0-1.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{Eidelman_2019, + author = {Eidelman, Vladimir}, + booktitle = {Proceedings of the 2nd Workshop on New Frontiers in Summarization}, + doi = {10.18653/v1/d19-5406}, + pages = {48–56}, + publisher = {Association for Computational Linguistics}, + title = {BillSum: A Corpus for Automatic Summarization of US Legislation}, + url = {http://dx.doi.org/10.18653/v1/D19-5406}, + year = {2019}, +} +""", + ) diff --git a/mteb/tasks/Retrieval/eng/ChatDoctorRetrieval.py b/mteb/tasks/Retrieval/eng/ChatDoctorRetrieval.py new file mode 100644 index 0000000000..c6a6b4bf45 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/ChatDoctorRetrieval.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class ChatDoctorRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ChatDoctorRetrieval", + description="A medical retrieval task based on ChatDoctor_HealthCareMagic dataset containing 112,000 real-world medical question-and-answer pairs. Each query is a medical question from patients (e.g., 'What are the symptoms of diabetes?'), and the corpus contains medical responses and healthcare information. The task is to retrieve the correct medical information that answers the patient's question. The dataset includes grammatical inconsistencies which help separate strong healthcare retrieval models from weak ones. Queries are patient medical questions while the corpus contains relevant medical responses, diagnoses, and treatment information from healthcare professionals.", + reference="https://huggingface.co/datasets/embedding-benchmark/ChatDoctor_HealthCareMagic", + dataset={ + "path": "embedding-benchmark/ChatDoctor_HealthCareMagic", + "revision": "50c2986fedffa33b38afd5c1752026f8e9e5ed1d", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2023-01-01", "2023-12-31"), + domains=["Medical"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{chatdoctor_healthcaremagic, + title = {ChatDoctor HealthCareMagic: Medical Question-Answer Retrieval Dataset}, + url = {https://huggingface.co/datasets/lavita/ChatDoctor-HealthCareMagic-100k}, + year = {2023}, +} +""", + prompt={ + "query": "Given a medical question from a patient, retrieve relevant healthcare information that best answers the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["_id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["_id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/FinQARetrieval.py b/mteb/tasks/Retrieval/eng/FinQARetrieval.py new file mode 100644 index 0000000000..5a2e9c852a --- /dev/null +++ b/mteb/tasks/Retrieval/eng/FinQARetrieval.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class FinQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FinQARetrieval", + description="A financial retrieval task based on FinQA dataset containing numerical reasoning questions over financial documents. Each query is a financial question requiring numerical computation (e.g., 'What is the percentage change in operating expenses from 2019 to 2020?'), and the corpus contains financial document text with tables and numerical data. The task is to retrieve the correct financial information that enables answering the numerical question. Queries are numerical reasoning questions while the corpus contains financial text passages with embedded tables, figures, and quantitative financial data from earnings reports.", + reference="https://huggingface.co/datasets/embedding-benchmark/FinQA", + dataset={ + "path": "embedding-benchmark/FinQA", + "revision": "bdd1903ce03153129480bfc14b710e3d612c1efd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2021-01-01", "2021-12-31"), + domains=["Financial"], + task_subtypes=["Question answering"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{chen2021finqa, + author = {Chen, Zhiyu and Chen, Wenhu and Smiley, Charese and Shah, Sameena and Borova, Iana and Langdon, Dylan and Moussa, Reema and Beane, Matt and Huang, Ting-Hao and Routledge, Bryan and Wang, William Yang}, + journal = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + title = {FinQA: A Dataset of Numerical Reasoning over Financial Data}, + year = {2021}, +} +""", + prompt={ + "query": "Given a financial numerical reasoning question, retrieve relevant financial data that helps answer the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/FinanceBenchRetrieval.py b/mteb/tasks/Retrieval/eng/FinanceBenchRetrieval.py new file mode 100644 index 0000000000..7f5130fcee --- /dev/null +++ b/mteb/tasks/Retrieval/eng/FinanceBenchRetrieval.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class FinanceBenchRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FinanceBenchRetrieval", + description="A financial retrieval task based on FinanceBench dataset containing financial questions and answers. Each query is a financial question (e.g., 'What was the total revenue in Q3 2023?'), and the corpus contains financial document excerpts and annual reports. The task is to retrieve the correct financial information that answers the question. Queries are financial questions while the corpus contains relevant excerpts from financial documents, earnings reports, and SEC filings with detailed financial data and metrics.", + reference="https://huggingface.co/datasets/embedding-benchmark/FinanceBench", + dataset={ + "path": "embedding-benchmark/FinanceBench", + "revision": "e68478442112cae36b70a216f52cc2777acf0a7e", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2023-01-01", "2023-12-31"), + domains=["Financial"], + task_subtypes=["Question answering"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{islam2023financebench, + author = {Islam, Pranab and Kannappan, Anand and Kiela, Douwe and Fergus, Rob and Ott, Myle and Wang, Sam and Garimella, Aparna and Garcia, Nino}, + journal = {arXiv preprint arXiv:2311.11944}, + title = {FinanceBench: A New Benchmark for Financial Question Answering}, + year = {2023}, +} +""", + prompt={ + "query": "Given a financial question, retrieve relevant financial information that best answers the question" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/eng/GovReportRetrieval.py b/mteb/tasks/Retrieval/eng/GovReportRetrieval.py new file mode 100644 index 0000000000..3c8c62ba93 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/GovReportRetrieval.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class GovReportRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + dataset={ + "path": "isaacus/mteb-GovReport", + "revision": "4482db2a053706c0aef0ab7bf1878d29bb0295f8", + }, + name="GovReport", + description="A dataset for evaluating the ability of information retrieval models to retrieve lengthy US government reports from their summaries.", + reference="https://huggingface.co/datasets/launch/gov_report", + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2021-06-06", "2025-07-28"), + domains=["Legal", "Government"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{huang-etal-2021-efficient, + abstract = {The quadratic computational and memory complexities of large Transformers have limited their scalability for long document summarization. In this paper, we propose Hepos, a novel efficient encoder-decoder attention with head-wise positional strides to effectively pinpoint salient information from the source. We further conduct a systematic study of existing efficient self-attentions. Combined with Hepos, we are able to process ten times more tokens than existing models that use full attentions. For evaluation, we present a new dataset, GovReport, with significantly longer documents and summaries. Results show that our models produce significantly higher ROUGE scores than competitive comparisons, including new state-of-the-art results on PubMed. Human evaluation also shows that our models generate more informative summaries with fewer unfaithful errors.}, + address = {Online}, + author = {Huang, Luyang and +Cao, Shuyang and +Parulian, Nikolaus and +Ji, Heng and +Wang, Lu}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + doi = {10.18653/v1/2021.naacl-main.112}, + eprint = {2104.02112}, + month = jun, + pages = {1419--1436}, + publisher = {Association for Computational Linguistics}, + title = {Efficient Attentions for Long Document Summarization}, + url = {https://aclanthology.org/2021.naacl-main.112}, + year = {2021}, +} +""", + ) diff --git a/mteb/tasks/Retrieval/eng/HC3FinanceRetrieval.py b/mteb/tasks/Retrieval/eng/HC3FinanceRetrieval.py new file mode 100644 index 0000000000..b521af232e --- /dev/null +++ b/mteb/tasks/Retrieval/eng/HC3FinanceRetrieval.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class HC3FinanceRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HC3FinanceRetrieval", + description="A financial retrieval task based on HC3 Finance dataset containing human vs AI-generated financial text detection. Each query is a financial question or prompt (e.g., 'Explain the impact of interest rate changes on bond prices'), and the corpus contains both human-written and AI-generated financial responses. The task is to retrieve the most relevant and accurate financial content that addresses the query. Queries are financial questions while the corpus contains detailed financial explanations, analysis, and educational content covering various financial concepts and market dynamics.", + reference="https://huggingface.co/datasets/embedding-benchmark/HC3Finance", + dataset={ + "path": "embedding-benchmark/HC3Finance", + "revision": "fda6fad068f2ed814d99f29dc95dbb28ac586943", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2023-01-01", "2023-12-31"), + domains=["Financial"], + task_subtypes=["Question answering"], + license="apache-2.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@article{guo2023hc3, + author = {Guo, Biyang and Zhang, Xin and Wang, Zhiyuan and Jiang, Mingyuan and Nie, Jinran and Ding, Yuxuan and Yue, Jianwei and Wu, Yupeng}, + journal = {arXiv preprint arXiv:2301.07597}, + title = {How Close is ChatGPT to Human Experts? Comparison Corpus, Evaluation, and Detection}, + year = {2023}, +} +""", + prompt={ + "query": "Given a financial question or prompt, retrieve relevant financial content that best addresses the query" + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + from datasets import load_dataset + + # Load the three configurations + corpus_ds = load_dataset( + self.metadata.dataset["path"], + "corpus", + revision=self.metadata.dataset["revision"], + )["corpus"] + queries_ds = load_dataset( + self.metadata.dataset["path"], + "queries", + revision=self.metadata.dataset["revision"], + )["queries"] + qrels_ds = load_dataset( + self.metadata.dataset["path"], + "default", + revision=self.metadata.dataset["revision"], + )["test"] + + # Initialize data structures with 'test' split + corpus = {} + queries = {} + relevant_docs = {} + + # Process corpus + for item in corpus_ds: + corpus[item["id"]] = {"title": "", "text": item["text"]} + + # Process queries + for item in queries_ds: + queries[item["id"]] = item["text"] + + # Process qrels (relevant documents) + for item in qrels_ds: + query_id = item["query-id"] + if query_id not in relevant_docs: + relevant_docs[query_id] = {} + relevant_docs[query_id][item["corpus-id"]] = int(item["score"]) + + # Organize data by splits as expected by MTEB + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": relevant_docs} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py index 9052a40a30..885fbb3fcb 100644 --- a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py +++ b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py @@ -8,6 +8,7 @@ class BSARDRetrieval(AbsTaskRetrieval): + superseded_by = "BSARDRetrieval.v2" ignore_identical_ids = True metadata = TaskMetadata( @@ -84,3 +85,81 @@ def load_data(self, **kwargs): } self.data_loaded = True + + +class BSARDRetrievalv2(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="BSARDRetrieval.v2", + description="BSARD is a French native dataset for legal information retrieval. BSARDRetrieval.v2 covers multi-article queries, fixing issues (#2906) with the previous data loading. ", + reference="https://huggingface.co/datasets/maastrichtlawtech/bsard", + dataset={ + "path": "maastrichtlawtech/bsard", + "revision": "5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fra-Latn"], + main_score="recall_at_100", + date=("2021-05-01", "2021-08-26"), + domains=["Legal", "Spoken"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{louis2022statutory, + address = {Dublin, Ireland}, + author = {Louis, Antoine and Spanakis, Gerasimos}, + booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/2022.acl-long.468}, + month = may, + pages = {6789–6803}, + publisher = {Association for Computational Linguistics}, + title = {A Statutory Article Retrieval Dataset in French}, + url = {https://aclanthology.org/2022.acl-long.468/}, + year = {2022}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + # fetch both subsets of the dataset, only test split + corpus_raw = datasets.load_dataset( + name="corpus", + split="corpus", + **self.metadata_dict["dataset"], + ) + queries_raw = datasets.load_dataset( + name="questions", + split=self.metadata.eval_splits[0], + **self.metadata_dict["dataset"], + ) + + split = self.metadata.eval_splits[0] + + self.queries = { + split: { + str(q["id"]): (q["question"] + " " + q["extra_description"]).strip() + for q in queries_raw + } + } + + self.corpus = { + split: {str(d["id"]): {"text": d["article"]} for d in corpus_raw} + } + + self.relevant_docs = {split: {}} + for q in queries_raw: + qid = str(q["id"]) + self.relevant_docs[split][qid] = {} + for doc_id in q["article_ids"]: + self.relevant_docs[split][qid][str(doc_id)] = 1 + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/multilingual/RuSciBenchRetrieval.py b/mteb/tasks/Retrieval/multilingual/RuSciBenchRetrieval.py new file mode 100644 index 0000000000..d70f7527a5 --- /dev/null +++ b/mteb/tasks/Retrieval/multilingual/RuSciBenchRetrieval.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +from collections import defaultdict +from typing import cast + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGES = { + "ru": ["rus-Cyrl"], + "en": ["eng-Latn"], +} + + +def load_ruscibench_data( + path: str, + langs: list, + eval_splits: list, + cache_dir: str | None = None, + revision: str | None = None, +): + corpus: dict[str, dict[str, dict[str, dict[str, str]] | None]] = { + lang: dict.fromkeys(eval_splits) for lang in langs + } + queries: dict[str, dict[str, dict[str, str] | None]] = { + lang: dict.fromkeys(eval_splits) for lang in langs + } + relevant_docs: dict[str, dict[str, dict[str, dict[str, int]] | None]] = { + lang: dict.fromkeys(eval_splits) for lang in langs + } + + for lang in langs: + lang_corpus = cast( + datasets.Dataset, + datasets.load_dataset( + path, f"corpus-{lang}", cache_dir=cache_dir, revision=revision + ), + )["corpus"] + lang_queries = cast( + datasets.Dataset, + datasets.load_dataset( + path, f"queries-{lang}", cache_dir=cache_dir, revision=revision + ), + )["queries"] + lang_qrels = cast( + datasets.Dataset, + datasets.load_dataset( + path, f"{lang}", cache_dir=cache_dir, revision=revision + ), + )["test"] + corpus[lang] = { + "test": { + str(e["_id"]): {"text": e["text"], "title": e["title"]} + for e in lang_corpus + } + } + queries[lang] = {"test": {str(e["_id"]): e["text"] for e in lang_queries}} + relevant_docs[lang]["test"] = defaultdict(dict) + for item in lang_qrels: + relevant_docs[lang]["test"][str(item["query-id"])].update( + {str(item["corpus-id"]): item["score"]} + ) + corpus = datasets.DatasetDict(corpus) + queries = datasets.DatasetDict(queries) + relevant_docs = datasets.DatasetDict(relevant_docs) + return corpus, queries, relevant_docs + + +class RuSciBenchCiteRetrieval(AbsTaskRetrieval, MultilingualTask): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="RuSciBenchCiteRetrieval", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_cite_retrieval", + "revision": "6cb447d02f41b8b775d5d9df7faf472f44d2f1db", + }, + description="""This task is focused on Direct Citation Prediction for scientific papers from eLibrary, + Russia's largest electronic library of scientific publications. Given a query paper (title and abstract), + the goal is to retrieve papers that are directly cited by it from a larger corpus of papers. + The dataset for this task consists of 3,000 query papers, 15,000 relevant (cited) papers, + and 75,000 irrelevant papers. The task is available for both Russian and English scientific texts.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Retrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="ndcg_at_10", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="mit", + dialect=[], + sample_creation="found", + annotations_creators="derived", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt={ + "query": "Given a title and abstract of a scientific paper, retrieve the titles and abstracts of other relevant papers", + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs = load_ruscibench_data( + path=self.metadata_dict["dataset"]["path"], + langs=self.metadata.eval_langs, + eval_splits=self.metadata_dict["eval_splits"], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["dataset"]["revision"], + ) + + self.data_loaded = True + + +class RuSciBenchCociteRetrieval(MultilingualTask, AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="RuSciBenchCociteRetrieval", + dataset={ + "path": "mlsa-iai-msu-lab/ru_sci_bench_cocite_retrieval", + "revision": "a5da47a245275669d2b6ddf8f96c5338dd2428b4", + }, + description="""This task focuses on Co-citation Prediction for scientific papers from eLibrary, + Russia's largest electronic library of scientific publications. Given a query paper (title and abstract), + the goal is to retrieve other papers that are co-cited with it. Two papers are considered co-cited + if they are both cited by at least 5 of the same other papers. Similar to the Direct Citation task, + this task employs a retrieval setup: for a given query paper, all other papers in the corpus that + are not co-cited with it are considered negative examples. The task is available for both Russian + and English scientific texts.""", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb", + type="Retrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="ndcg_at_10", + date=("2007-01-01", "2023-01-01"), + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="mit", + dialect=[], + sample_creation="found", + annotations_creators="derived", + bibtex_citation=r""" +@article{vatolin2024ruscibench, + author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.}, + doi = {10.1134/S1064562424602191}, + issn = {1531-8362}, + journal = {Doklady Mathematics}, + month = {12}, + number = {1}, + pages = {S251--S260}, + title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations}, + url = {https://doi.org/10.1134/S1064562424602191}, + volume = {110}, + year = {2024}, +} +""", + prompt={ + "query": "Given a title and abstract of a scientific paper, retrieve the titles and abstracts of other relevant papers", + }, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs = load_ruscibench_data( + path=self.metadata_dict["dataset"]["path"], + langs=self.metadata.eval_langs, + eval_splits=self.metadata_dict["eval_splits"], + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata_dict["dataset"]["revision"], + ) + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/nob/snl_retrieval.py b/mteb/tasks/Retrieval/nob/snl_retrieval.py index 2dfe630390..978145aa65 100644 --- a/mteb/tasks/Retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/Retrieval/nob/snl_retrieval.py @@ -11,7 +11,7 @@ class SNLRetrieval(AbsTaskRetrieval): name="SNLRetrieval", dataset={ "path": "adrlau/navjordj-SNL_summarization_copy", # TODO: replace with mteb/SNLRetrieval after #2820 is resolved. - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "revision": "22c474c88fb4678052f9099bb917ad8f9e155f9f", }, description="Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.", reference="https://huggingface.co/datasets/mteb/SNLRetrieval", diff --git a/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py b/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py new file mode 100644 index 0000000000..4492d9b493 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ArguAnaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ArguAna-VN", + description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://argumentation.bplaced.net/arguana/data", + dataset={ + "path": "GreenNode/arguana-vn", + "revision": "2a5133a05d7430e6f353497b1624a6e73148105b", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical", "Written"], + task_subtypes=["Article retrieval"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ArguAna"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py new file mode 100644 index 0000000000..cb177bdf05 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackAndroidVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackAndroid-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-android-vn", + "revision": "4a022e7213ccc05ee970a176abd0293b3a0a2da0", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Web", "Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackAndroid"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py new file mode 100644 index 0000000000..2b531eb7d8 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackGisVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGis-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-gis-vn", + "revision": "755156d548a8288efdb29b80bad302750ab33977", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackGis"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py new file mode 100644 index 0000000000..a12c8f1c29 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackMathematicaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematica-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-mathematica-vn", + "revision": "d0cc9b60ba66faa3fb21cb9a54ef969af548b312", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackMathematica"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py new file mode 100644 index 0000000000..e10955e22a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackPhysicsVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysics-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-physics-vn", + "revision": "8b6b68b59933cc72985f674f76c80a678c27d6be", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackPhysics"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py new file mode 100644 index 0000000000..ebe78eeddc --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackProgrammersRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackProgrammers-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-programmers-vn", + "revision": "1a628c4e61f71ffdb7707d6d4024d25cfe68215a", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Written", "Non-fiction"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackProgrammers"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py new file mode 100644 index 0000000000..339f9f43d4 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackStatsVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStats-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-stats-vn", + "revision": "6b8164f3af61f3bb7728724229ba36213fb46c25", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackStats"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py new file mode 100644 index 0000000000..ceb699bb88 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackTexVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTex-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-tex-vn", + "revision": "aec43e5ae40451526528b3fc80dd5983ec388e21", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackTex"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py new file mode 100644 index 0000000000..dc751d04a9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackUnixVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnix-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-unix-vn", + "revision": "f8b884697871cb38901139f2435c273135f83a3f", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackUnix"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py new file mode 100644 index 0000000000..03751d00ae --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackWebmastersVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmasters-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-webmasters-vn", + "revision": "482d6e560d977b137e435d33379c5a8049e70e8d", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackWebmasters"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py new file mode 100644 index 0000000000..a77659887b --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackWordpressVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpress-VN", + dataset={ + "path": "GreenNode/cqadupstack-wordpress-vn", + "revision": "2230f80e1baf42aa005731ca86577621c566fcd7", + }, + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackWordpress"], + ) diff --git a/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py b/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py new file mode 100644 index 0000000000..9edb81001e --- /dev/null +++ b/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ClimateFEVERVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ClimateFEVER-VN", + description="""A translated dataset from CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html", + dataset={ + "path": "GreenNode/climate-fever-vn", + "revision": "42328bf787e17b1ad1a88be4f5e87ea9fb668511", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ClimateFEVER"], + ) diff --git a/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py b/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py new file mode 100644 index 0000000000..ec01dc8abe --- /dev/null +++ b/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class DBPediaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-VN", + description="""A translated dataset from DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/iai-group/DBpedia-Entity/", + dataset={ + "path": "GreenNode/dbpedia-vn", + "revision": "c3e20179fbcee16217ef9461a14a54b7faca9b63", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Encyclopaedic"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["DBPedia"], + ) diff --git a/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py b/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py new file mode 100644 index 0000000000..49fdcf2fe6 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class FEVERVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FEVER-VN", + dataset={ + "path": "GreenNode/fever-vn", + "revision": "a543dd8b98aed3603110c01d26db05ba39b87d49", + }, + description="""A translated dataset from FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences + extracted from Wikipedia and subsequently verified without knowledge of the sentence they were + derived from. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://fever.ai/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["FEVER"], + ) diff --git a/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py b/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py new file mode 100644 index 0000000000..a039aa53c9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class FiQA2018VN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FiQA2018-VN", + description="""A translated dataset from Financial Opinion Mining and Question Answering + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://sites.google.com/view/fiqa/", + dataset={ + "path": "GreenNode/fiqa-vn", + "revision": "6c3cdf6f102151dbbbbc1d2cf999b305eba44dae", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["FiQA2018"], + ) diff --git a/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py b/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py new file mode 100644 index 0000000000..13de52d87a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class HotpotQAVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HotpotQA-VN", + dataset={ + "path": "GreenNode/hotpotqa-vn", + "revision": "8a5220c7af5084f0d5d2afeb74f9c2b41b759ff0", + }, + description="""A translated dataset from HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong + supervision for supporting facts to enable more explainable question answering systems. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://hotpotqa.github.io/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["HotpotQA"], + ) diff --git a/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py b/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py new file mode 100644 index 0000000000..2fbb5140f4 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MSMARCOVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MSMARCO-VN", + dataset={ + "path": "GreenNode/msmarco-vn", + "revision": "85d1ad4cc9070b8d019d65f5af1631a2ab91e294", + }, + description="""A translated dataset from MS MARCO is a collection of datasets focused on deep learning in search + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://microsoft.github.io/msmarco/", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MSMARCO"], + ) diff --git a/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py b/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py new file mode 100644 index 0000000000..7086eb5b7a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NFCorpusVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-VN", + dataset={ + "path": "GreenNode/nfcorpus-vn", + "revision": "a13d72fbb859be3dc19ab669d1ec9510407d2dcd", + }, + description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical", "Academic", "Written"], + task_subtypes=["Article retrieval"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["NFCorpus"], + ) diff --git a/mteb/tasks/Retrieval/vie/NQVNRetrieval.py b/mteb/tasks/Retrieval/vie/NQVNRetrieval.py new file mode 100644 index 0000000000..5bb940c167 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/NQVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NQVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NQ-VN", + dataset={ + "path": "GreenNode/nq-vn", + "revision": "40a6d7f343b9c9f4855a426d8c431ad5f8aaf56b", + }, + description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://ai.google.com/research/NaturalQuestions/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Encyclopaedic"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["NQ"], + ) diff --git a/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py b/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py new file mode 100644 index 0000000000..d3e357f871 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class QuoraVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Quora-VN", + dataset={ + "path": "GreenNode/quora-vn", + "revision": "3363d81e41b67c1032bf3b234882a03d271e2289", + }, + description="""A translated dataset from QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a + question, find other (duplicate) questions. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + type="Retrieval", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Blog"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Quora"], + ) diff --git a/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py b/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py new file mode 100644 index 0000000000..93088401c9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SCIDOCSVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-VN", + dataset={ + "path": "GreenNode/scidocs-vn", + "revision": "724cddfa9d328a193f303a0a9b7789468ac79f26", + }, + description="""A translated dataset from SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation + prediction, to document classification and recommendation. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://allenai.org/data/scidocs", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Written", "Non-fiction"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SCIDOCS"], + ) diff --git a/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py b/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py new file mode 100644 index 0000000000..d2d5034742 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SciFactVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-VN", + dataset={ + "path": "GreenNode/scifact-vn", + "revision": "483a7cf890c523c954e7751d328c5bb65061dcff", + }, + description="""A translated dataset from SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/allenai/scifact", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Medical", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SciFact"], + ) diff --git a/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py b/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py new file mode 100644 index 0000000000..ef9f69224a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TRECCOVIDVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-VN", + description="""A translated dataset from TRECCOVID is an ad-hoc search challenge based on the COVID-19 dataset containing scientific articles related to the COVID-19 pandemic. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://ir.nist.gov/covidSubmit/index.html", + dataset={ + "path": "GreenNode/trec-covid-vn", + "revision": "54d73a1ea11ea0ae4ec0214ec519c93db79dee88", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical", "Academic", "Written"], + task_subtypes=["Article retrieval"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TRECCOVID"], + ) diff --git a/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py b/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py new file mode 100644 index 0000000000..8011850d66 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class Touche2020VN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020-VN", + description="""A translated dataset from Touché Task 1: Argument Retrieval for Controversial Questions + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://webis.de/events/touche-20/shared-task-1.html", + dataset={ + "path": "GreenNode/webis-touche2020-vn", + "revision": "cd4389b182aec622c8121ee8db988359197159c1", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Touche2020"], + ) diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index 471789f1c9..00ccf0d7ff 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -29,4 +29,7 @@ from .rus.RUParaPhraserSTS import * from .rus.RuSTSBenchmarkSTS import * from .spa.STSES import * +from .vie.BiossesSTSVN import * +from .vie.SickrSTSVN import * +from .vie.STSBenchmarkSTSVN import * from .zho.CMTEBSTS import * diff --git a/mteb/tasks/STS/vie/BiossesSTSVN.py b/mteb/tasks/STS/vie/BiossesSTSVN.py new file mode 100644 index 0000000000..dcc1819289 --- /dev/null +++ b/mteb/tasks/STS/vie/BiossesSTSVN.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class BiossesSTSVN(AbsTaskSTS): + metadata = TaskMetadata( + name="BIOSSES-VN", + dataset={ + "path": "GreenNode/biosses-sts-vn", + "revision": "1dae4a6df91c0852680cd4ab48c8c1d8a9ed49b2", + }, + description="""A translated dataset from Biomedical Semantic Similarity Estimation. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="cosine_spearman", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["BIOSSES"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py b/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py new file mode 100644 index 0000000000..1c7879a01d --- /dev/null +++ b/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class STSBenchmarkSTSVN(AbsTaskSTS): + metadata = TaskMetadata( + name="STSBenchmark-VN", + dataset={ + "path": "GreenNode/stsbenchmark-sts-vn", + "revision": "f24d66738cda4a02138ada5af7689a92ce1fcad6", + }, + description="""A translated dataset from Semantic Textual Similarity Benchmark (STSbenchmark) dataset. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/PhilipMay/stsb-multi-mt/", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="cosine_spearman", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Blog", "News", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["STSBenchmark"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/vie/SickrSTSVN.py b/mteb/tasks/STS/vie/SickrSTSVN.py new file mode 100644 index 0000000000..93c5d585d2 --- /dev/null +++ b/mteb/tasks/STS/vie/SickrSTSVN.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SickrSTSVN(AbsTaskSTS): + metadata = TaskMetadata( + name="SICK-R-VN", + dataset={ + "path": "GreenNode/sickr-sts-vn", + "revision": "bc89f0401983c456b609f7fb324278f346b2cccf", + }, + description="""A translated dataset from Semantic Textual Similarity SICK-R dataset as described here: + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://aclanthology.org/2020.lrec-1.207", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="cosine_spearman", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Textual Entailment"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SICK-R"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/vie/__init__.py b/mteb/tasks/STS/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 01d17012a0..39bf266568 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -20,6 +20,7 @@ from .InstructionRetrieval import * from .MultiLabelClassification import * from .PairClassification import * +from .Regression import * from .Reranking import * from .Retrieval import * from .SpeedTask import * diff --git a/pyproject.toml b/pyproject.toml index 3d39bb0496..2c44dfbd8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.33" +version = "1.38.46" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -100,7 +100,7 @@ timm = ["timm>=1.0.15,<1.1.0"] open_clip_torch = ["open_clip_torch==2.31.0"] xet = ["huggingface_hub>=0.32.0"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] -colpali_engine = ["colpali_engine>=0.3.10"] +colpali_engine = ["colpali_engine>=0.3.12"] muq = ["muq==0.1.0"] [tool.coverage.report] diff --git a/scripts/data/clean_and_update_tasks.py b/scripts/data/clean_and_update_tasks.py new file mode 100644 index 0000000000..dac4c8ca5c --- /dev/null +++ b/scripts/data/clean_and_update_tasks.py @@ -0,0 +1,1216 @@ +from __future__ import annotations + +import ast +import importlib.util +import re +import traceback +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +import datasets +import orjson +import pandas as pd +import typer +from datasets import Dataset, DatasetDict +from huggingface_hub import HfApi +from tqdm import tqdm + +""" +This script is designed for data cleaning and the automatic creation of new task versions with updated data. +Currently, it supports only monolingual classification tasks. + +The results of this script can be seen in the following PRs: +- https://github.com/embeddings-benchmark/mteb/pull/2632 +- https://github.com/embeddings-benchmark/mteb/pull/2900 +""" + +app = typer.Typer() +datasets.logging.set_verbosity_error() +datasets.logging.disable_progress_bar() + + +@dataclass +class TaskMetadataInfo: + class_name: str + name: str + version: int + dataset: dict[str, str] + eval_splits: list[str] + eval_langs: list[str] + + +def format_scores( + scores_files: list[Path], + filter_multiple_model_versions: bool = False, + name_type: Literal["model", "parent"] = "model", + filter_missing_scores: bool = False, + use_all_subsets: bool = False, +) -> pd.DataFrame: + if filter_multiple_model_versions: + latest_files: dict[tuple[str, str], Path] = {} + for scores_file in scores_files: + model_name = ( + scores_file.parent.parent.name + if name_type == "model" + else scores_file.parent.parent.parent.name + ) + task_id = scores_file.name + key = (model_name, task_id) + if ( + key not in latest_files + or scores_file.stat().st_mtime > latest_files[key].stat().st_mtime + ): + latest_files[key] = scores_file + filtered_scores_files = list(latest_files.values()) + else: + filtered_scores_files = scores_files + + scores_data = [] + for scores_file in filtered_scores_files: + try: + s = orjson.loads(scores_file.read_bytes().replace(b"NaN", b"null")) + if use_all_subsets: + for subset in s["scores"]: + scores_data.extend( + { + "task_name": s["task_name"], + "evaluation_time": s.get( + "evaluation_time", None + ), # Handle missing eval time + "model_name": scores_file.parent.parent.name + if name_type == "model" + else scores_file.parent.parent.parent.name, + "subset": subset, + **score, + } + for score in s["scores"][subset] + ) + + else: + score_set = s["scores"].get( + "test", s["scores"].get("dev", s["scores"].get("train")) + ) + if score_set is None: + warnings.warn( + f"No 'test' or 'dev' or 'train' scores found in {scores_file}", + stacklevel=2, + ) + continue + + scores_data.extend( + { + "task_name": s["task_name"], + "evaluation_time": s.get("evaluation_time", None), + "model_name": scores_file.parent.parent.name + if name_type == "model" + else scores_file.parent.parent.parent.name, + **score, + } + for score in score_set + ) + except Exception as e: + warnings.warn(f"Error processing file {scores_file}: {e}", stacklevel=2) + + if not scores_data: + return pd.DataFrame() + + scores = pd.DataFrame(scores_data) + scores["languages"] = scores["languages"].apply( + lambda x: ",".join(x) if isinstance(x, list) else x + ) + scores["model_name_original"] = scores["model_name"].str.replace("__", "/") + + scores = scores.drop_duplicates( + subset=["task_name", "model_name", "languages", "hf_subset", "main_score"] + ) + scores = scores.sort_values(["task_name", "main_score"], ascending=[True, False]) + + mode_count = scores.groupby("model_name")["main_score"].count().mode().iloc[0] + model_counts = scores.groupby("model_name")["main_score"].count() + filtered_models = model_counts[model_counts < mode_count].index.tolist() + + if filtered_models: + print( + f"WARNING: The following models have fewer scores than the mode ({mode_count}):" + ) + for model in filtered_models: + print(f" - {model}: {model_counts[model]} scores") + + if filter_missing_scores: + scores = scores[~scores["model_name"].isin(filtered_models)] + + return scores + + +def find_class_node( + module: ast.Module, class_name: str +) -> tuple[ast.ClassDef | None, int]: + for i, node in enumerate(module.body): + if isinstance(node, ast.ClassDef) and node.name == class_name: + return node, i + return None, -1 + + +def find_latest_class_name(module: ast.Module) -> str | None: + version_pattern = re.compile(r"^(?P.+?)(?:V(?P\d+))?$") + groups: dict[str, list[tuple[int, ast.ClassDef, int]]] = {} + for idx, node in enumerate(module.body): + if isinstance(node, ast.ClassDef): + m = version_pattern.match(node.name) + if not m: + continue + base = m.group("base") + ver = int(m.group("ver")) if m.group("ver") else 1 + groups.setdefault(base, []).append((ver, node, idx)) + if not groups: + return None + base_class_name, _ = max(groups.items(), key=lambda kv: max(e[0] for e in kv[1])) + return base_class_name + + +def read_lines(file_path: Path) -> list[str]: + return file_path.read_text().splitlines(keepends=True) + + +def write_lines(file_path: Path, lines: list[str]) -> None: + file_path.write_text("".join(lines)) + + +def resolve_local_variable( + var_name: str, func_node: ast.FunctionDef +) -> ast.expr | None: + # Walk backwards to find the last assignment + for stmt in reversed(func_node.body): + if isinstance(stmt, ast.Assign): + for target in stmt.targets: + if isinstance(target, ast.Name) and target.id == var_name: + return stmt.value + return None + + +def update_class_header( + new_block: list[str], old_name: str, base_name: str, new_suffix: str +) -> list[str]: + header_re = re.compile(r"^(\s*class\s+)" + re.escape(old_name) + r"(\b.*)$") + new_block[0] = header_re.sub(r"\1" + base_name + new_suffix + r"\2", new_block[0]) + return new_block + + +def update_task_metadata( + new_block: list[str], old_name: str, base_name: str, new_meta_suffix: str +) -> list[str]: + within = False + parens = 0 + metadata_start_index = -1 + metadata_end_index = -1 + indent = -1 + adapted_from_index = -1 + + for i, line in enumerate(new_block): + if not within and re.match(r"\s*metadata\s*=\s*TaskMetadata\s*\(", line): + within = True + metadata_start_index = i + parens = line.count("(") - line.count(")") + if line.strip().endswith("()"): + parens = 0 + metadata_end_index = i + within = False + elif within: + if "adapted_from" in line: + adapted_from_index = i + parens += line.count("(") - line.count(")") + m = re.match(r'^(\s*name\s*=\s*")([^"]*)(".*)$', line) + if m: + base_name = re.sub(r"\.v\d+$", "", m.group(2)) + new_block[i] = ( + m.group(1) + base_name + new_meta_suffix + m.group(3) + "\n" + ) + if indent == -1 and line.strip() and not line.strip().startswith("#"): + indent = len(line) - len(line.lstrip(" ")) + if parens <= 0: + within = False + metadata_end_index = i + + if metadata_start_index != -1: + if adapted_from_index != -1: + line = new_block[adapted_from_index] + line_indent = len(line) - len(line.lstrip(" ")) + new_block[adapted_from_index] = ( + f'{" " * line_indent}adapted_from=["{old_name}"],\n' + ) + else: + if indent == -1: + line = new_block[metadata_end_index] + indent = (len(line) - len(line.lstrip())) + 4 + + line_to_insert_before_idx = metadata_end_index + # handle case where last line is just ')' + if new_block[line_to_insert_before_idx].strip() == ")": + line_to_insert_before_idx -= 1 + + last_line_idx = -1 + for i in range(line_to_insert_before_idx, -1, -1): + if new_block[i].strip(): + last_line_idx = i + break + + if last_line_idx != -1: + if not new_block[last_line_idx].rstrip().endswith(","): + new_block[last_line_idx] = new_block[last_line_idx].rstrip() + ",\n" + + new_line = " " * indent + f'adapted_from=["{old_name}"],\n' + new_block.insert(metadata_end_index, new_line) + return new_block + + +def handle_dataset_transform( + new_block: list[str], block: list[str], ds: DatasetDict +) -> list[str]: + text = "".join(new_block) + module = ast.parse(text) + class_node = next(node for node in module.body if isinstance(node, ast.ClassDef)) + + transform_node = None + for node in class_node.body: + if isinstance(node, ast.FunctionDef) and node.name == "dataset_transform": + transform_node = node + break + + if transform_node: + subsampling_calls = [] + for node in ast.walk(transform_node): + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Attribute) + and node.func.attr == "stratified_subsampling" + ): + subsampling_calls.append(node) + + if not subsampling_calls: + start = transform_node.lineno - 1 + end = transform_node.end_lineno + del new_block[start:end] + else: + n_samples = 2048 + splits = ["test"] + for call in subsampling_calls: + for kw in call.keywords: + if kw.arg == "n_samples": + value_node = kw.value + if isinstance(value_node, ast.Name): + resolved_node = resolve_local_variable( + value_node.id, transform_node + ) + if resolved_node: + value_node = resolved_node + try: + n_samples = ast.literal_eval(value_node) + except (ValueError, TypeError): + pass # Keep default + elif kw.arg == "splits": + splits = ast.literal_eval(kw.value) + + needs_subsampling = any( + len(ds[split]) > n_samples for split in splits if split in ds + ) + + if needs_subsampling: + original_class_lines = "".join(block) + original_class_module = ast.parse(original_class_lines) + original_class_node = next( + node + for node in original_class_module.body + if isinstance(node, ast.ClassDef) + ) + + original_transform_node = None + for node in original_class_node.body: + if ( + isinstance(node, ast.FunctionDef) + and node.name == "dataset_transform" + ): + original_transform_node = node + break + + new_transform_body = [] + if original_transform_node: + for stmt in original_transform_node.body: + is_subsampling_stmt = False + for node in ast.walk(stmt): + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Attribute) + and node.func.attr == "stratified_subsampling" + ): + is_subsampling_stmt = True + break + if is_subsampling_stmt: + stmt_start_line = stmt.lineno - 1 + stmt_end_line = ( + stmt.end_lineno + if stmt.end_lineno + else stmt_start_line + 1 + ) + new_transform_body.extend( + block[stmt_start_line:stmt_end_line] + ) + + start = ( + transform_node.body[0].lineno - 1 + if transform_node.body + else transform_node.lineno + ) + end = transform_node.end_lineno + del new_block[start:end] + + if new_transform_body: + indent = ( + " " * (transform_node.body[0].col_offset) + if transform_node.body + else " " * (transform_node.col_offset + 4) + ) + new_transform_body_lines = [ + f"{indent}{line.lstrip()}" for line in new_transform_body + ] + new_block.insert(start, "".join(new_transform_body_lines)) + + else: + start = transform_node.lineno - 1 + end = transform_node.end_lineno + del new_block[start:end] + return new_block + + +def get_v2_block( + block: list[str], + old_name: str, + base_name: str, + new_suffix: str, + new_meta_suffix: str, + ds: DatasetDict, +) -> list[str]: + new_block = block.copy() + new_block = update_class_header(new_block, old_name, base_name, new_suffix) + new_block = update_task_metadata(new_block, old_name, base_name, new_meta_suffix) + new_block = handle_dataset_transform(new_block, block, ds) + return new_block + + +def deduplicate(dataset: Dataset) -> Dataset: + unique_texts = set() + indices_to_keep = [] + for i, text in enumerate(dataset["text"]): + text = text.strip() + if text not in unique_texts: + unique_texts.add(text) + indices_to_keep.append(i) + + return dataset.select(indices_to_keep) + + +def filter_empty(dataset: Dataset) -> Dataset: + return dataset.filter(lambda x: len(x["text"].strip()) > 0) + + +def filter_leakage(train_dataset: Dataset, test_dataset: Dataset) -> Dataset: + train_texts = set(train_dataset["text"]) + test_indices_no_leakage = [ + i for i, text in enumerate(test_dataset["text"]) if text not in train_texts + ] + return test_dataset.select(test_indices_no_leakage) + + +def filter_controversial(dataset_dict: DatasetDict) -> DatasetDict: + normalized: dict[str, set[str | tuple[str, ...]]] = {} + for _, ds in dataset_dict.items(): + for text, label in zip(ds["text"], ds["label"]): + key = text.strip().lower() + normalized.setdefault(key, set()).add( + label if isinstance(label, (str, int, float)) else tuple(label) + ) + bad_texts = {t for t, labels in normalized.items() if len(labels) > 1} + return DatasetDict( + { + split: ds.filter(lambda x: x["text"].strip().lower() not in bad_texts) + for split, ds in dataset_dict.items() + } + ) + + +def filter_short(dataset: Dataset, min_words: int = 3) -> Dataset: + return dataset.filter(lambda x: len(x["text"].strip().split()) >= min_words) + + +def calculate_inner_indent(lines: list[str], node: ast.ClassDef) -> int: + indent = len(lines[node.lineno - 1]) - len(lines[node.lineno - 1].lstrip(" ")) + for line in lines[node.lineno : node.end_lineno]: + if line.strip(): + return len(line) - len(line.lstrip(" ")) + return indent + 4 + + +def load_and_transform(file_path: Path, metadata: TaskMetadataInfo) -> DatasetDict: + return load_dataset(file_path, metadata.class_name) + + +def split_train_test( + ds: DatasetDict, metadata: TaskMetadataInfo +) -> tuple[DatasetDict, bool, list[tuple[str, str, int]]]: + report: list[tuple[str, str, int]] = [] + is_changed = False + if "train" in ds and metadata.eval_splits == "train": + is_changed = True + before = len(ds["train"]) + ds["train"] = ds["train"].cast_column( + "label", datasets.ClassLabel(names=list(set(ds["train"]["label"]))) + ) + label_counts = pd.Series(ds["train"]["label"]).value_counts() + one_sample_labels = set(label_counts[label_counts == 1].index.tolist()) + + if len(one_sample_labels) > 0: + before_size = len(ds["train"]) + ds["train"] = ds["train"].filter( + lambda x: x["label"] not in one_sample_labels + ) + removed = before_size - len(ds["train"]) + if removed > 0: + report.append(("filter_one_sample_labels", "train", removed)) + + splits = ds["train"].train_test_split( + test_size=min(2048, before // 2), seed=42, stratify_by_column="label" + ) + ds = DatasetDict({"train": splits["train"], "test": splits["test"]}) + report.append(("create_test_split", "train_to_test", before - len(ds["train"]))) + metadata.eval_splits = ["test"] + return ds, is_changed, report + + +def clean_dataset( + ds: DatasetDict, + metadata: TaskMetadataInfo, +) -> tuple[DatasetDict, list[tuple[str, str, int]], bool]: + report: list[tuple[str, str, int]] = [] + is_changed = False + + skip_codes = {"zho", "jpn", "tha", "mya", "cmn"} + apply_short = not any( + lang.split("-")[0] in skip_codes for lang in metadata.eval_langs + ) + + transforms = [ + ("filter_empty", filter_empty), + ("deduplicate", deduplicate), + ] + if apply_short: + transforms.append(("filter_short", filter_short)) + + for split in ["train", *metadata.eval_splits]: + if split not in ds: + continue + for name, fn in transforms: + before = len(ds[split]) + ds[split] = fn(ds[split]) + removed = before - len(ds[split]) + if removed > 0: + is_changed = True + report.append((name, split, removed)) + + ds, is_changed_after_split, split_report = split_train_test(ds, metadata) + report.extend(split_report) + is_changed = is_changed or is_changed_after_split + + for split in metadata.eval_splits: + if split == "train": + continue + before_test = len(ds[split]) + ds["test"] = filter_leakage(ds["train"], ds[split]) + removed = before_test - len(ds[split]) + if removed > 0: + is_changed = True + report.append(("filter_leakage", split, removed)) + + orig = {split: len(ds[split]) for split in ds} + ds = filter_controversial(ds) + for split in ds: + removed = orig[split] - len(ds[split]) + if removed > 0: + is_changed = True + report.append(("filter_controversial", split, removed)) + + return ds, report, is_changed + + +def print_report( + report_folder: Path, + language: str, + original_records: list[tuple[str, str, int]], + filter_records: list[tuple[str, str, str, int]], +) -> None: + report_lines: list[str] = [] + report_lines.append("## Original Sizes") + report_lines.append("| Task | Split | Original Size |") + report_lines.append("|------|:-----:|--------------:|") + for task, split, size in original_records: + report_lines.append(f"| {task} | {split} | {size} |") + + report_lines.append("") + report_lines.append("## Cleaning Report") + report_lines.append("| Task | Filter | Split | Removed |") + report_lines.append("|------|--------|:-----:|--------:|") + for task, name, split, removed in filter_records: + report_lines.append(f"| {task} | {name} | {split} | {removed} |") + + (report_folder / f"report_{language}.md").write_text("\n".join(report_lines)) + + +def push_dataset(ds: DatasetDict, metadata: TaskMetadataInfo, username: str) -> str: + prev_path = metadata.dataset.get("path", "") + if prev_path.startswith("mteb/") and username != "mteb": + repo_id = prev_path.replace("mteb/", f"{username}/") + else: + base = metadata.class_name + if base.endswith("Classification"): + base = base[: -len("Classification")] + name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", base) + name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name).lower() + repo_id = f"{username}/{name}" + ds.push_to_hub(repo_id, config_name=metadata.dataset.get("name", "default")) + return repo_id + + +def update_metadata( + file_path: Path, class_name: str, new_ver: int, repo_id: str, pr_id: int +) -> None: + api = HfApi() + commit = api.list_repo_commits(repo_id=repo_id, repo_type="dataset")[0].commit_id + update_v2_metadata_dataset( + file_path, class_name + f"V{new_ver}", repo_id, commit, pr_id + ) + + +def parse_metadata_dataset(file_path: Path, class_name: str) -> dict[str, str]: + source = file_path.read_text() + module = ast.parse(source) + class_node, _ = find_class_node(module, class_name) + for node in class_node.body: + if isinstance(node, ast.Assign): + target = node.targets[0] + if isinstance(target, ast.Name) and target.id == "metadata": + call = node.value + for kw in call.keywords: + if kw.arg == "dataset": + return ast.literal_eval(kw.value) + return {} + + +def get_transform_statements(file_path: Path, class_name: str) -> list[ast.stmt]: + source = file_path.read_text() + module = ast.parse(source) + class_node, _ = find_class_node(module, class_name) + for node in class_node.body: + if isinstance(node, ast.FunctionDef) and node.name == "dataset_transform": + return [ + stmt + for stmt in node.body + if not ( + isinstance(stmt, ast.Assign) + and isinstance(stmt.value, ast.Call) + and isinstance(stmt.value.func, ast.Attribute) + and stmt.value.func.attr == "stratified_subsampling" + ) + ] + return [] + + +def load_dataset(file_path: Path, class_name: str) -> DatasetDict: # noqa: F811 + original = file_path.read_text() + lines = original.splitlines(keepends=True) + filtered: list[str] = [] + skip = False + parens = 0 + + for line in lines: + if not skip and "stratified_subsampling" in line: + skip = True + parens = line.count("(") - line.count(")") + continue + if skip: + parens += line.count("(") - line.count(")") + if parens <= 0: + skip = False + filtered.append(" pass") + continue + filtered.append(line) + + file_path.write_text("".join(filtered)) + + spec = importlib.util.spec_from_file_location("task_module", str(file_path)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + TaskClass = getattr(module, class_name) + task = TaskClass() + task.load_data() + ds = task.dataset + + file_path.write_text(original) + + return ds + + +def _find_metadata_assignment(class_node: ast.ClassDef) -> ast.Assign | None: + for stmt in class_node.body: + if ( + isinstance(stmt, ast.Assign) + and isinstance(stmt.targets[0], ast.Name) + and stmt.targets[0].id == "metadata" + ): + return stmt + return None + + +def _find_keyword(call_node: ast.Call, keyword_name: str) -> ast.keyword | None: + for kw in call_node.keywords: + if kw.arg == keyword_name: + return kw + return None + + +def _get_indent(line: str) -> str: + return line[: len(line) - len(line.lstrip())] + + +def _update_description( + lines: list[str], call_node: ast.Call, pr_id: int +) -> tuple[list[str], list[int]]: + desc_kw = _find_keyword(call_node, "description") + if not desc_kw or not isinstance(desc_kw.value, ast.Constant): + return lines, [] + + value_node = desc_kw.value + original_desc = value_node.value + start_line_idx = desc_kw.lineno - 1 + end_line_idx = value_node.end_lineno - 1 + + indent = _get_indent(lines[start_line_idx]) + new_desc_val = f'"""{original_desc}\n This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/{pr_id})"""' + lines[start_line_idx] = f"{indent}description={new_desc_val},\n" + + deleted_indices = list(range(start_line_idx + 1, end_line_idx + 1)) + return lines, deleted_indices + + +def _update_dataset_dict( + lines: list[str], call_node: ast.Call, new_path: str, new_revision: str +) -> tuple[list[str], list[int]]: + dataset_kw = _find_keyword(call_node, "dataset") + if not dataset_kw or not isinstance(dataset_kw.value, ast.Dict): + return lines, [] + + dict_node = dataset_kw.value + lines_to_delete = [] + + for i, key_node in enumerate(dict_node.keys): + if not isinstance(key_node, ast.Constant): + continue + + value_node = dict_node.values[i] + line_idx = key_node.lineno - 1 + indent = _get_indent(lines[line_idx]) + key = key_node.value + + if key == "path": + lines[line_idx] = f'{indent}"path": "{new_path}",\n' + elif key == "revision": + lines[line_idx] = f'{indent}"revision": "{new_revision}",\n' + elif key == "trust_remote_code": + lines_to_delete.extend(range(line_idx, value_node.end_lineno)) + + return lines, lines_to_delete + + +def _update_eval_splits( + lines: list[str], call_node: ast.Call, module: ast.Module +) -> list[str]: + eval_splits_kw = _find_keyword(call_node, "eval_splits") + if not eval_splits_kw: + return lines + + value_node = eval_splits_kw.value + if isinstance(value_node, ast.Name): + resolved = _resolve_variable(value_node.id, module) + if resolved: + value_node = resolved + + is_train_split = ( + isinstance(value_node, ast.List) + and len(value_node.elts) == 1 + and isinstance(value_node.elts[0], ast.Constant) + and value_node.elts[0].value == "train" + ) + + if is_train_split: + line_idx = eval_splits_kw.lineno - 1 + indent = _get_indent(lines[line_idx]) + lines[line_idx] = f'{indent}eval_splits=["test"],\n' + + return lines + + +def update_v2_metadata_dataset( + file_path: Path, class_name: str, new_path: str, new_revision: str, pr_id: int +) -> None: + lines = read_lines(file_path) + module = ast.parse("".join(lines)) + + class_node, _ = find_class_node(module, class_name) + if not class_node: + raise ValueError(f"Class {class_name} not found in {file_path}") + + metadata_node = _find_metadata_assignment(class_node) + if not metadata_node or not isinstance(metadata_node.value, ast.Call): + return + + call_node = metadata_node.value + lines, desc_deleted = _update_description(lines, call_node, pr_id) + lines, ds_deleted = _update_dataset_dict(lines, call_node, new_path, new_revision) + lines = _update_eval_splits(lines, call_node, module) + + all_deleted_indices = sorted(set(desc_deleted + ds_deleted), reverse=True) + for i in all_deleted_indices: + del lines[i] + + write_lines(file_path, lines) + + +def _resolve_variable(name: str, module: ast.Module) -> ast.expr | None: + for node in module.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == name: + return node.value + return None + + +def parse_all_task_metadata( + file_path: Path, latest_version: bool = True +) -> list[TaskMetadataInfo]: + source = file_path.read_text() + module = ast.parse(source) + + version_pattern = re.compile(r"^(?P.+?)(?:V(?P\d+))?$") + all_tasks: list[TaskMetadataInfo] = [] + + for node in module.body: + if not isinstance(node, ast.ClassDef): + continue + + m = version_pattern.match(node.name) + if not m: + continue + ver = int(m.group("ver")) if m.group("ver") else 1 + + name = "" + dataset: dict[str, str] = {} + eval_split: list[str] = ["test"] + eval_langs: list[str] = [] + for stmt in node.body: + if not ( + isinstance(stmt, ast.Assign) + and isinstance(stmt.targets[0], ast.Name) + and stmt.targets[0].id == "metadata" + ): + continue + if not isinstance(stmt.value, ast.Call): + continue + call = stmt.value + for kw in call.keywords: + value_node = kw.value + if isinstance(value_node, ast.Name): + resolved_node = _resolve_variable(value_node.id, module) + if resolved_node: + value_node = resolved_node + try: + if kw.arg == "name" and isinstance(value_node, ast.Constant): + name = value_node.value + elif kw.arg == "dataset": + dataset = ast.literal_eval(value_node) + elif kw.arg == "eval_splits": + eval_split = ast.literal_eval(value_node) or ["test"] + elif kw.arg == "eval_langs": + eval_langs = ast.literal_eval(value_node) + except (ValueError, SyntaxError): + pass + break + + if not name: + continue + + all_tasks.append( + TaskMetadataInfo(node.name, name, ver, dataset, eval_split, eval_langs) + ) + + if not latest_version: + return all_tasks + + latest: dict[str, TaskMetadataInfo] = {} + for task in all_tasks: + base_name = re.sub(r"V\d+$", "", task.name) + if base_name not in latest or task.version > latest[base_name].version: + latest[base_name] = task + + return list(latest.values()) + + +def parse_all_task_metadata_versions(file_path: Path) -> list[TaskMetadataInfo]: + return parse_all_task_metadata(file_path, latest_version=False) + + +def bump_version_for_class( + file_path: Path, base_class_name: str, ds: DatasetDict +) -> int: + lines = read_lines(file_path) + module = ast.parse("".join(lines)) + + version_pattern = re.compile(rf"^{re.escape(base_class_name)}(?:V(?P\d+))?$") + selected: tuple[int, ast.ClassDef] | None = None + for node in module.body: + if not isinstance(node, ast.ClassDef): + continue + m = version_pattern.match(node.name) + if not m: + continue + ver = int(m.group("ver")) if m.group("ver") else 1 + if selected is None or ver > selected[0]: + selected = (ver, node) + if selected is None: + raise ValueError(f"Class {base_class_name} not found in {file_path}") + + version, node = selected + inner = calculate_inner_indent(lines, node) + new_version = version + 1 + + task_name = "" + for stmt in node.body: + if ( + isinstance(stmt, ast.Assign) + and isinstance(stmt.targets[0], ast.Name) + and stmt.targets[0].id == "metadata" + ): + call = stmt.value + if ( + isinstance(call, ast.Call) + and isinstance(call.func, ast.Name) + and call.func.id == "TaskMetadata" + ): + for kw in call.keywords: + if kw.arg == "name" and isinstance(kw.value, ast.Constant): + task_name = kw.value.value + break + break + + superseded = " " * inner + f'superseded_by = "{task_name}.v{new_version}"\n' + block = lines[node.lineno - 1 : node.end_lineno] + v2_block = get_v2_block( + block, node.name, base_class_name, f"V{new_version}", f".v{new_version}", ds + ) + + new_lines: list[str] = [] + for i, l in enumerate(lines): + new_lines.append(l) + if i == node.lineno - 1: + new_lines.append(superseded) + if i == node.end_lineno - 1: + new_lines.append("\n") + new_lines.extend(v2_block) + write_lines(file_path, new_lines) + return new_version + + +def process_task( + file_path: Path, + metadata: TaskMetadataInfo, + pr_id: int, + username: str, + verbose: bool, +) -> tuple[ + tuple[str, int] | None, + list[tuple[str, str, int]], + list[tuple[str, str, str, int]], +]: + if verbose: + print(" task ->", metadata.class_name) + try: + ds = load_and_transform(file_path, metadata) + except Exception: + print(metadata.class_name, "dataset loading failed") + traceback.print_exc() + return None, [], [] + + if verbose: + print(ds) + + original_size = {split: len(ds[split]) for split in ds} + ds_cleaned, report, is_changed = clean_dataset(ds.copy(), metadata) + if verbose: + print(f"is_changed: {is_changed}") + + if not is_changed: + if verbose: + print(f"{metadata.class_name} is unchanged") + return None, [], [] + + original_records = [ + (metadata.name, split, size) for split, size in original_size.items() + ] + filter_records = [ + (metadata.name, name, split, removed) for name, split, removed in report + ] + + repo_id = push_dataset(ds_cleaned, metadata, username) + base_name = re.sub(r"V\d+$", "", metadata.class_name) + new_ver = bump_version_for_class(file_path, base_name, ds) + update_metadata(file_path, base_name, new_ver, repo_id, pr_id) + + return (metadata.name, new_ver), original_records, filter_records + + +@app.command() +def create_and_prepare( + folder: Path = typer.Argument(..., exists=True, dir_okay=True), + pr_id: int = typer.Argument(..., help="Pull request ID"), + report_folder: Path = typer.Option( + "scripts/data/cleaning_reports", exists=True, dir_okay=True + ), + username: str = "mteb", + start_lang: str | None = None, + verbose: bool = typer.Option(False, "--verbose"), +) -> None: + changed_tasks: list[tuple[str, int]] = [] + all_original_records: list[tuple[str, str, int]] = [] + all_filter_records: list[tuple[str, str, str, int]] = [] + + files_to_process = sorted( + p for p in folder.glob("**/*.py") if p.name != "__init__.py" + ) + if start_lang: + files_to_process = [p for p in files_to_process if p.parent.name >= start_lang] + progress_bar = tqdm(files_to_process, desc="Processing files") + + try: + for file_path in progress_bar: + progress_bar.set_description(f"Processing {file_path.name}") + if verbose: + print("working on", file_path.name) + + for metadata in parse_all_task_metadata(file_path): + changed_task, original_records, filter_records = process_task( + file_path, metadata, pr_id, username, verbose + ) + if changed_task: + changed_tasks.append(changed_task) + all_original_records.extend(original_records) + all_filter_records.extend(filter_records) + except Exception: + print(traceback.format_exc()) + + if changed_tasks: + print_report( + report_folder, folder.name, all_original_records, all_filter_records + ) + + unique_changed = sorted(set(changed_tasks)) + tasks_str = " ".join( + f"{task_name} {task_name}.v{version}" + for task_name, version in unique_changed + ) + print( + "mteb run -m intfloat/multilingual-e5-small -t" + f" {tasks_str} && mteb run -m" + " sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -t" + f" {tasks_str}" + ) + + +@app.command() +def compare_results( + results_dir: Path = typer.Option( + "/home/admin/vatolin/experiments/mteb/results", exists=True, dir_okay=True + ), + tasks_file: Path | None = typer.Option( + None, + "--tasks-file", + "-f", + exists=True, + file_okay=True, + dir_okay=False, + help="File with a list of tasks to compare. One task per line.", + ), +) -> None: + models = [ + "intfloat/multilingual-e5-small", + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + ] + scores_files = [ + f for f in results_dir.glob("**/**/*.json") if f.stem not in {"model_meta"} + ] + df_all: pd.DataFrame = format_scores(scores_files, use_all_subsets=True) + + required_tasks: set[str] | None = None + if tasks_file: + required_tasks = { + line.strip() for line in tasks_file.read_text().splitlines() if line.strip() + } + + for model in models: + df_model = df_all[df_all["model_name"] == model.replace("/", "__")] + + df_old = df_model[ + ~df_model["task_name"].str.contains(r"\.v\d+$", regex=True) + ].copy() + df_new = df_model[ + df_model["task_name"].str.contains(r"\.v\d+$", regex=True) + ].copy() + + df_new["task_name"] = df_new["task_name"].str.replace( + r"\.v\d+$", "", regex=True + ) + + if required_tasks: + available_for_comparison = set(df_old["task_name"]).intersection( + set(df_new["task_name"]) + ) + missing_tasks = required_tasks - available_for_comparison + if missing_tasks: + print(f"**{model}**") + print( + f"Skipping due to missing tasks: {', '.join(sorted(missing_tasks))}" + ) + print() + continue + + df_old = df_old[df_old["task_name"].isin(required_tasks)] + df_new = df_new[df_new["task_name"].isin(required_tasks)] + + old_duplicated = df_old.duplicated(subset=["task_name", "subset"]) + if old_duplicated.sum() > 0: + print("Duplicated scores") + print(model) + print(df_old[old_duplicated][["task_name", "subset", "languages"]]) + continue + new_duplicated = df_new.duplicated(subset=["task_name", "subset"]) + if new_duplicated.sum() > 0: + print(model) + print(df_new[new_duplicated][["task_name", "subset", "languages"]]) + continue + df_old = df_old.set_index(["task_name", "subset"])["main_score"] + df_new = df_new.set_index(["task_name", "subset"])["main_score"] + df_cmp = pd.DataFrame( + { + "main_score_old": df_old, + "main_score_new": df_new, + } + ).dropna() + + if df_cmp.empty: + continue + + df_cmp["delta_percent"] = ( + (df_cmp["main_score_new"] - df_cmp["main_score_old"]) + / df_cmp["main_score_old"] + * 100 + ).round(2) + df_cmp = df_cmp.reset_index(drop=False).sort_values(["task_name", "subset"]) + + print(f"**{model}**") + print( + df_cmp[ + [ + "task_name", + "subset", + "main_score_old", + "main_score_new", + "delta_percent", + ] + ].to_markdown(index=False) + ) + print() + + +@app.command() +def report_cleaning( + tasks: list[str] = typer.Argument( + ..., help="List of task names to generate cleaning reports for." + ), + folder: Path = typer.Option( + "mteb/tasks/Classification", exists=True, dir_okay=True + ), + report_folder: Path = typer.Option( + "scripts/data/cleaning_reports", exists=True, dir_okay=True + ), + verbose: bool = typer.Option(False, "--verbose"), +) -> None: + all_original_records: list[tuple[str, str, int]] = [] + all_filter_records: list[tuple[str, str, str, int]] = [] + + all_tasks_map: dict[str, tuple[TaskMetadataInfo, Path]] = {} + files_to_process = sorted( + p for p in folder.glob("**/*.py") if p.name != "__init__.py" + ) + + for file_path in files_to_process: + tasks_in_file = parse_all_task_metadata_versions(file_path) + for task_metadata in tasks_in_file: + all_tasks_map[task_metadata.name] = (task_metadata, file_path) + + for task_name in tasks: + v2_task_name = f"{task_name}.v2" + if v2_task_name not in all_tasks_map: + if verbose: + print(f"Task {v2_task_name} not found, skipping.") + continue + + if task_name not in all_tasks_map: + if verbose: + print(f"Base task {task_name} not found, skipping.") + continue + + v1_metadata, file_path = all_tasks_map[task_name] + if verbose: + print(f"Processing {task_name} from {file_path}") + + try: + ds = load_and_transform(file_path, v1_metadata) + print(ds) + except Exception: + print(f"Dataset loading failed for {v1_metadata.class_name}") + traceback.print_exc() + continue + + original_size = {split: len(ds[split]) for split in ds} + ds_new, report, _ = clean_dataset(ds.copy(), v1_metadata) + print(ds_new) + print(report) + + original_records = [ + (v1_metadata.name, split, size) for split, size in original_size.items() + ] + filter_records = [ + (v1_metadata.name, name, split, removed) for name, split, removed in report + ] + + all_original_records.extend(original_records) + all_filter_records.extend(filter_records) + + if all_filter_records: + print_report( + report_folder, folder.name, all_original_records, all_filter_records + ) + print(f"Report generated in {report_folder}/report_{folder.name}.md") + else: + print("No tasks with v2 versions found or no changes after cleaning.") + + +if __name__ == "__main__": + app() diff --git a/scripts/data/common_voice/process_data.py b/scripts/data/common_voice/process_data.py index 22610499c8..4e7e0dc49d 100644 --- a/scripts/data/common_voice/process_data.py +++ b/scripts/data/common_voice/process_data.py @@ -532,7 +532,7 @@ def compute_stats_from_tsv(language_code, split_clips, corpus_dir): sentence_domain_dict = dict(sentence_domain_counts) # Ensure it's a regular dict and not a defaultdict by creating a new dict - sentence_domain_dict = {k: v for k, v in sentence_domain_dict.items()} + sentence_domain_dict = dict(sentence_domain_dict.items()) # Ensure specific domain keys exist with a value of 0 if they don't exist in the data required_domain_keys = [ @@ -639,7 +639,7 @@ def parse_stats_dict(content): # If JSON parsing fails, try ast.literal_eval (handles single quotes) try: return ast.literal_eval(full_dict_str) - except: + except Exception: pass # If both fail, try to normalize quotes and parse as JSON @@ -654,7 +654,7 @@ def parse_stats_dict(content): # Try parsing again try: return json.loads(normalized) - except: + except Exception: pass return None diff --git a/scripts/data/common_voice/upload_data.py b/scripts/data/common_voice/upload_data.py index d928b24caa..e35a53854e 100644 --- a/scripts/data/common_voice/upload_data.py +++ b/scripts/data/common_voice/upload_data.py @@ -7,7 +7,6 @@ import os import time from pathlib import Path -from typing import List, Optional, Tuple from huggingface_hub import HfApi, list_repo_files, login from huggingface_hub.utils import RepositoryNotFoundError @@ -66,8 +65,8 @@ def parse_args(): def discover_files( - base_dir: str, languages: Optional[List[str]] = None -) -> List[Tuple[str, str]]: + base_dir: str, languages: list[str] | None = None +) -> list[tuple[str, str]]: """Discover all tar and tsv files in the base directory. Returns: @@ -215,10 +214,7 @@ def main(): # Calculate total size total_size = 0 for local_path, _ in files_to_upload: - try: - total_size += os.path.getsize(local_path) - except: - pass + total_size += os.path.getsize(local_path) print(f"Total size to upload: {format_size(total_size)}") diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index bcf9a1f83d..22bf88e1df 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -13,13 +13,20 @@ _HISTORIC_DATASETS = [ "PolEmo2.0-IN", "PolEmo2.0-OUT", + "PolEmo2.0-OUT.v2", "PAC", + "PAC.v2", "TNews", + "TNews.v2", "IFlyTek", + "IFlyTek.v2", "MultilingualSentiment", + "MultilingualSentiment.v2", "JDReview", + "JDReview.v2", "OnlineShopping", "Waimai", + "Waimai.v2", "BlurbsClusteringP2P", "BlurbsClusteringS2S", "TenKGnadClusteringP2P", diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py index bc5804a50a..79bb5a5431 100644 --- a/tests/test_benchmark/mock_models.py +++ b/tests/test_benchmark/mock_models.py @@ -20,11 +20,14 @@ class MockNumpyEncoder(mteb.Encoder): - def __init__(self): - pass + def __init__(self, seed: int | None = None): + if seed is not None: + self.rng = np.random.default_rng(seed) + else: + self.rng = np.random.default_rng() def encode(self, sentences, prompt_name: str | None = None, **kwargs): - return np.random.rand(len(sentences), 10) + return self.rng.random((len(sentences), 10)) class MockTorchEncoder(mteb.Encoder): @@ -230,7 +233,7 @@ def __init__( model: The SentenceTransformer model to use. Can be a string (model name), a SentenceTransformer model, or a CrossEncoder model. revision: The revision of the model to use. model_prompts: A dictionary mapping task names to prompt names. - First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt, + First priority is given to the composed prompt of task name + prompt type (query or document), then to the specific task prompt, then to the composed prompt of task type + prompt type, then to the specific task type prompt, and finally to the specific prompt type. **kwargs: Additional arguments to pass to the SentenceTransformer model. diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 11b5f4cc7f..69487b2805 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -115,7 +115,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): class EncoderWithoutInstructions(MockSentenceTransformer): def encode(self, sentences, **kwargs): - assert kwargs["prompt_name"] is None + assert kwargs["prompt"] is None return super().encode(sentences, **kwargs) if isinstance(task_name, mteb.AbsTask): @@ -302,7 +302,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) -@pytest.mark.parametrize("task_name", ["NQ-NL-query", "NQ-NL-passage"]) +@pytest.mark.parametrize("task_name", ["NQ-NL-query", "NQ-NL-document"]) def test_prompt_name_split_correctly(task_name: str, tmp_path: Path): """Test that the task name is split correctly into task name and prompt type for tasks with multiple `-` in their names. @@ -331,12 +331,12 @@ def test_model_query_passage_prompts_task_type( task_name = task.metadata.name if is_task_name else task.metadata.type def check_prompt(prompt_name, is_query): - prompt_type = "query" if is_query else "passage" + prompt_type = "query" if is_query else "document" assert prompt_name == f"{task_name}-{prompt_type}" prompt_list = { f"{task_name}-query": "query", - f"{task_name}-passage": "passage", + f"{task_name}-document": "document", } class MockEncoderWithPrompts(mteb.Encoder): diff --git a/tests/test_evaluators/test_ClassificationEvaluator.py b/tests/test_evaluators/test_ClassificationEvaluator.py new file mode 100644 index 0000000000..31b179be5b --- /dev/null +++ b/tests/test_evaluators/test_ClassificationEvaluator.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import numpy as np +import pytest + +from mteb.evaluation.evaluators import logRegClassificationEvaluator +from tests.test_benchmark.mock_models import MockNumpyEncoder +from tests.test_benchmark.mock_tasks import MockClassificationTask + + +def is_binary_classification(y_train: list[int], y_test: list[int]) -> bool: + """Check if the classification task is binary based on the labels.""" + all_labels = set(y_train + y_test) + return len(all_labels) == 2 + + +# Fixtures +@pytest.fixture +def model(): + return MockNumpyEncoder(seed=42) + + +@pytest.fixture +def mock_task(): + task = MockClassificationTask() + task.load_data() + return task + + +def test_output_structure(model, mock_task): + """Test that the evaluator returns the expected output structure.""" + train_data = mock_task.dataset["train"] + test_data = mock_task.dataset["test"] + + evaluator = logRegClassificationEvaluator( + train_data["text"], + np.array(train_data["label"]), + test_data["text"], + np.array(test_data["label"]), + task_name="test_classification", + ) + scores, test_cache = evaluator(model) + + # Check basic structure + assert isinstance(scores, dict) + assert isinstance(test_cache, np.ndarray) + + # Check required metrics + assert "accuracy" in scores + assert "f1" in scores + assert "f1_weighted" in scores + + # Check binary-specific metrics (MockClassificationTask is binary) + is_binary = is_binary_classification(train_data["label"], test_data["label"]) + if is_binary: + assert "ap" in scores + assert "ap_weighted" in scores + else: + assert "ap" not in scores + + +def test_expected_scores(model, mock_task): + """Test that the evaluator returns expected scores with deterministic model.""" + train_data = mock_task.dataset["train"] + test_data = mock_task.dataset["test"] + + evaluator = logRegClassificationEvaluator( + train_data["text"], + np.array(train_data["label"]), + test_data["text"], + np.array(test_data["label"]), + task_name="test_classification", + ) + scores, _ = evaluator(model) + + # Check that we get reasonable scores (MockClassificationTask has deterministic data) + assert 0.0 <= scores["accuracy"] <= 1.0 + assert 0.0 <= scores["f1"] <= 1.0 + assert 0.0 <= scores["f1_weighted"] <= 1.0 + + +def test_cache_usage_binary(): + """Test that embedding caching works correctly for binary classification. + + This test verifies the caching mechanism used to avoid re-encoding the same + sentences multiple times. The workflow is: + + 1. Run a first evaluation which encodes test sentences and returns embeddings cache + 2. Run a second evaluation with the same test sentences, passing the cache from step 1 + 3. Verify that the cache is preserved (not modified) during the second evaluation + 4. Verify that the second evaluation still produces valid classification results + + The cache contains the encoded embeddings for the test sentences, allowing the + evaluator to skip the encoding step when the same sentences are evaluated again. + This is particularly useful when running multiple evaluations on the same dataset + with different models or parameters. + """ + mock_task = MockClassificationTask() + mock_task.load_data() + train_data = mock_task.dataset["train"] + test_data = mock_task.dataset["test"] + + model = MockNumpyEncoder(seed=42) + + # First evaluation to generate cache + evaluator_initial = logRegClassificationEvaluator( + train_data["text"], + np.array(train_data["label"]), + test_data["text"], + np.array(test_data["label"]), + task_name="test_binary_cache", + ) + _, test_cache_initial = evaluator_initial(model) + + # Second evaluation using cache + evaluator_with_cache = logRegClassificationEvaluator( + train_data["text"], + np.array(train_data["label"]), + test_data["text"], + np.array(test_data["label"]), + task_name="test_binary_cache_usage", + ) + scores_with_cache, test_cache_after_cache_usage = evaluator_with_cache( + model, test_cache=test_cache_initial + ) + + # Verify cache is preserved + assert np.array_equal(test_cache_initial, test_cache_after_cache_usage) + + # Verify that scores are returned (structure check only) + assert "accuracy" in scores_with_cache + assert "f1" in scores_with_cache + assert "f1_weighted" in scores_with_cache + assert "ap" in scores_with_cache + assert "ap_weighted" in scores_with_cache diff --git a/tests/test_evaluators/test_STSEvaluator.py b/tests/test_evaluators/test_STSEvaluator.py new file mode 100644 index 0000000000..aee3963724 --- /dev/null +++ b/tests/test_evaluators/test_STSEvaluator.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import pytest + +from mteb.evaluation.evaluators.STSEvaluator import STSEvaluator +from tests.test_benchmark.mock_models import MockNumpyEncoder +from tests.test_benchmark.mock_tasks import MockSTSTask + + +# Fixtures +@pytest.fixture +def model(): + return MockNumpyEncoder(seed=42) + + +@pytest.fixture +def mock_task(): + task = MockSTSTask() + task.load_data() + return task + + +def test_output_structure(model, mock_task): + """Test that the evaluator returns the expected output structure and scores.""" + test_data = mock_task.dataset["test"] + + evaluator = STSEvaluator( + sentences1=test_data["sentence1"], + sentences2=test_data["sentence2"], + gold_scores=test_data["score"], + task_name="test_sts", + ) + scores = evaluator(model) + + # Check basic structure + assert isinstance(scores, dict) + + # Check required metrics + assert "pearson" in scores + assert "spearman" in scores + assert "cosine_pearson" in scores + assert "cosine_spearman" in scores + assert "manhattan_pearson" in scores + assert "manhattan_spearman" in scores + assert "euclidean_pearson" in scores + assert "euclidean_spearman" in scores + + # Check exact score values with deterministic model + assert scores["pearson"] == -1.0 + assert scores["spearman"] == -0.9999999999999999 + assert scores["cosine_pearson"] == -1.0 + assert scores["cosine_spearman"] == -0.9999999999999999 + assert scores["manhattan_pearson"] == -1.0 + assert scores["manhattan_spearman"] == -0.9999999999999999 + assert scores["euclidean_pearson"] == -1.0 + assert scores["euclidean_spearman"] == -0.9999999999999999 + + +def test_basic_functionality(model, mock_task): + """Test basic functionality and proper initialization.""" + test_data = mock_task.dataset["test"] + + evaluator = STSEvaluator( + sentences1=test_data["sentence1"], + sentences2=test_data["sentence2"], + gold_scores=test_data["score"], + task_name="test_sts", + ) + + # Check that data is properly stored + assert evaluator.sentences1 == test_data["sentence1"] + assert evaluator.sentences2 == test_data["sentence2"] + assert evaluator.gold_scores == test_data["score"] + assert evaluator.task_name == "test_sts" + + +def test_batch_size_parameter(mock_task): + """Test that the batch_size parameter in encode_kwargs works correctly.""" + from tests.test_benchmark.mock_models import MockNumpyEncoder + + # Create a mock encoder that respects batch_size and tracks batch calls + class BatchTrackingMockEncoder(MockNumpyEncoder): + def __init__(self, seed=42): + super().__init__(seed) + self.batch_calls = [] # Track each batch call + + def encode(self, sentences, prompt_name=None, **kwargs): + batch_size = kwargs.get("batch_size", 32) + + # Track individual batch calls + for i in range(0, len(sentences), batch_size): + batch = sentences[i : i + batch_size] + self.batch_calls.append(len(batch)) + + return super().encode(sentences, prompt_name, **kwargs) + + test_data = mock_task.dataset["test"] + + evaluator = STSEvaluator( + sentences1=test_data["sentence1"], + sentences2=test_data["sentence2"], + gold_scores=test_data["score"], + task_name="test_sts", + ) + + # Test with batch_size=1 - should process texts one at a time + tracking_model_1 = BatchTrackingMockEncoder() + scores_batch_1 = evaluator(tracking_model_1, encode_kwargs={"batch_size": 1}) + + # Calculate expected batch calls + num_sentences1 = len(test_data["sentence1"]) + num_sentences2 = len(test_data["sentence2"]) + + # With batch_size=1, each text should be processed in its own batch + expected_batch_1_calls = num_sentences1 + num_sentences2 + assert len(tracking_model_1.batch_calls) == expected_batch_1_calls + assert all(batch_size == 1 for batch_size in tracking_model_1.batch_calls) + + # Test with batch_size=2 - should process texts in pairs (mostly) + tracking_model_2 = BatchTrackingMockEncoder() + scores_batch_2 = evaluator(tracking_model_2, encode_kwargs={"batch_size": 2}) + + # Calculate expected batch calls for batch_size=2 + import math + + expected_batch_2_calls = math.ceil(num_sentences1 / 2) + math.ceil( + num_sentences2 / 2 + ) + assert len(tracking_model_2.batch_calls) == expected_batch_2_calls + # Each batch should have at most 2 items (last batch might have 1) + assert all(batch_size <= 2 for batch_size in tracking_model_2.batch_calls) + assert all(batch_size >= 1 for batch_size in tracking_model_2.batch_calls) + + # Check that evaluation works with custom batch sizes + assert isinstance(scores_batch_1, dict) + assert isinstance(scores_batch_2, dict) + assert "pearson" in scores_batch_1 + assert "pearson" in scores_batch_2 diff --git a/tests/test_evaluators/test_SummarizationEvaluator.py b/tests/test_evaluators/test_SummarizationEvaluator.py new file mode 100644 index 0000000000..f58537d528 --- /dev/null +++ b/tests/test_evaluators/test_SummarizationEvaluator.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +import pytest + +from mteb.evaluation.evaluators.SummarizationEvaluator import SummarizationEvaluator +from tests.test_benchmark.mock_models import MockNumpyEncoder +from tests.test_benchmark.mock_tasks import MockSummarizationTask + + +# Fixtures +@pytest.fixture +def model(): + return MockNumpyEncoder(seed=42) + + +@pytest.fixture +def mock_task(): + task = MockSummarizationTask() + task.load_data() + return task + + +def test_output_structure(model, mock_task): + """Test that the evaluator returns the expected output structure and scores.""" + test_data = mock_task.dataset["test"] + + evaluator = SummarizationEvaluator( + human_summaries=test_data["human_summaries"], + machine_summaries=test_data["machine_summaries"], + gold_scores=test_data["relevance"], + task_name="test_summarization", + ) + scores = evaluator(model) + + # Check basic structure + assert isinstance(scores, dict) + + # Check required metrics + assert "pearson" in scores + assert "spearman" in scores + assert "cosine_pearson" in scores + assert "cosine_spearman" in scores + assert "dot_pearson" in scores + assert "dot_spearman" in scores + + # Check exact score values with deterministic model + assert scores["pearson"] == -1.0 + assert scores["spearman"] == -0.9999999999999999 + assert scores["cosine_pearson"] == -1.0 + assert scores["cosine_spearman"] == -0.9999999999999999 + assert scores["dot_pearson"] == 0.0 + assert scores["dot_spearman"] == 0.0 + + +def test_basic_functionality(model, mock_task): + """Test basic functionality and proper initialization.""" + test_data = mock_task.dataset["test"] + + evaluator = SummarizationEvaluator( + human_summaries=test_data["human_summaries"], + machine_summaries=test_data["machine_summaries"], + gold_scores=test_data["relevance"], + task_name="test_summarization", + ) + + # Check that data is properly stored + assert evaluator.human_summaries == test_data["human_summaries"] + assert evaluator.machine_summaries == test_data["machine_summaries"] + assert evaluator.gold_scores == test_data["relevance"] + assert evaluator.task_name == "test_summarization" + + +def test_encode_kwargs_handling(model, mock_task): + """Test that encode_kwargs are properly handled.""" + test_data = mock_task.dataset["test"] + + evaluator = SummarizationEvaluator( + human_summaries=test_data["human_summaries"], + machine_summaries=test_data["machine_summaries"], + gold_scores=test_data["relevance"], + task_name="test_summarization", + ) + + # Test that the evaluator accepts encode_kwargs + scores = evaluator(model, encode_kwargs={"batch_size": 16}) + assert isinstance(scores, dict) + assert "pearson" in scores + + +def test_batch_size_parameter(mock_task): + """Test that the batch_size parameter in encode_kwargs works correctly.""" + from tests.test_benchmark.mock_models import MockNumpyEncoder + + # Create a mock encoder that respects batch_size and tracks batch calls + class BatchTrackingMockEncoder(MockNumpyEncoder): + def __init__(self, seed=42): + super().__init__(seed) + self.batch_calls = [] # Track each batch call + + def encode(self, sentences, prompt_name=None, **kwargs): + batch_size = kwargs.get("batch_size", 32) + + # Track individual batch calls + for i in range(0, len(sentences), batch_size): + batch = sentences[i : i + batch_size] + self.batch_calls.append(len(batch)) + + return super().encode(sentences, prompt_name, **kwargs) + + test_data = mock_task.dataset["test"] + + evaluator = SummarizationEvaluator( + human_summaries=test_data["human_summaries"], + machine_summaries=test_data["machine_summaries"], + gold_scores=test_data["relevance"], + task_name="test_summarization", + ) + + # Test with batch_size=1 - should process texts one at a time + tracking_model_1 = BatchTrackingMockEncoder() + scores_batch_1 = evaluator(tracking_model_1, encode_kwargs={"batch_size": 1}) + + # Calculate expected batch calls + total_human_texts = sum(len(hs) for hs in test_data["human_summaries"]) + total_machine_texts = sum(len(ms) for ms in test_data["machine_summaries"]) + + # With batch_size=1, each text should be processed in its own batch + expected_batch_1_calls = total_human_texts + total_machine_texts + assert len(tracking_model_1.batch_calls) == expected_batch_1_calls + assert all(batch_size == 1 for batch_size in tracking_model_1.batch_calls) + + # Test with batch_size=2 - should process texts in pairs (mostly) + tracking_model_2 = BatchTrackingMockEncoder() + scores_batch_2 = evaluator(tracking_model_2, encode_kwargs={"batch_size": 2}) + + # Calculate expected batch calls for batch_size=2 + import math + + expected_batch_2_calls = math.ceil(total_human_texts / 2) + math.ceil( + total_machine_texts / 2 + ) + assert len(tracking_model_2.batch_calls) == expected_batch_2_calls + # Each batch should have at most 2 items (last batch might have 1) + assert all(1 <= batch_size <= 2 for batch_size in tracking_model_2.batch_calls) + + # Check that evaluation works with custom batch sizes + assert isinstance(scores_batch_1, dict) + assert isinstance(scores_batch_2, dict) + assert "pearson" in scores_batch_1 + assert "pearson" in scores_batch_2 + + +def test_empty_scores_handling(model, mock_task): + """Test that the evaluator handles cases where some samples have equal scores.""" + test_data = mock_task.dataset["test"] + + # Create a case where some gold scores are identical + modified_gold_scores = test_data["relevance"].copy() + if len(modified_gold_scores) > 0 and len(modified_gold_scores[0]) > 1: + # Make all scores in the first sample identical + modified_gold_scores[0] = [modified_gold_scores[0][0]] * len( + modified_gold_scores[0] + ) + + evaluator = SummarizationEvaluator( + human_summaries=test_data["human_summaries"], + machine_summaries=test_data["machine_summaries"], + gold_scores=modified_gold_scores, + task_name="test_summarization_equal_scores", + ) + + # Should still work even with some samples having equal scores + scores = evaluator(model) + assert isinstance(scores, dict) + assert "pearson" in scores diff --git a/tests/test_models/test_model_meta.py b/tests/test_models/test_model_meta.py index f0c54dd99c..3a2b214068 100644 --- a/tests/test_models/test_model_meta.py +++ b/tests/test_models/test_model_meta.py @@ -62,6 +62,7 @@ def test_model_similar_tasks(training_datasets): "Touche2020", "Touche2020-Fa", "Touche2020-NL", + "Touche2020-VN", "Touche2020Retrieval.v3", ] assert sorted(dummy_model_meta.get_training_datasets().keys()) == expected diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index d584a98852..738392c623 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -57,10 +57,12 @@ def test_validate_task_to_prompt_name(task_name: str | mteb.AbsTask): model_prompts = dict.fromkeys(task_names, "prompt_name") model_prompts |= {task_name + "-query": "prompt_name" for task_name in task_names} - model_prompts |= {task_name + "-passage": "prompt_name" for task_name in task_names} + model_prompts |= { + task_name + "-document": "prompt_name" for task_name in task_names + } model_prompts |= { "query": "prompt_name", - "passage": "prompt_name", + "document": "prompt_name", } Wrapper.validate_task_to_prompt_name(model_prompts) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 98bb0697f4..1269382af5 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -81,8 +81,8 @@ def test_load_data( @pytest.mark.test_datasets @pytest.mark.flaky( - reruns=3, - reruns_delay=5, + reruns=5, + reruns_delay=12, only_rerun=["AssertionError"], reason="May fail due to network issues", )