diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index be1d1c7418..71ed2554f9 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,11 +4,18 @@ -## Checklist +### Code Quality +- [ ] **Code Formatted**: Format the code using `make lint` to maintain consistent style. -- [ ] Run tests locally to make sure nothing is broken using `make test`. -- [ ] Run the formatter to format the code using `make lint`. +### Documentation + +- [ ] **Updated Documentation**: Add or update documentation to reflect the changes introduced in this PR. + +### Testing + +- [ ] **New Tests Added**: Write tests to cover new functionality. Validate with `make test-with-coverage`. +- [ ] **Tests Passed**: Run tests locally using `make test` or `make test-with-coverage` to ensure no existing functionality is broken. ### Adding datasets checklist diff --git a/.gitignore b/.gitignore index 977fe8dc1a..d5cc51748b 100644 --- a/.gitignore +++ b/.gitignore @@ -147,4 +147,5 @@ results/ uv.lock # model loading tests -model_names.txt \ No newline at end of file +model_names.txt +mteb/leaderboard/__cached_results.json diff --git a/README.md b/README.md index f556cad894..59cc5da9e2 100644 --- a/README.md +++ b/README.md @@ -472,24 +472,24 @@ evaluation.run(model, ...) 
## Documentation -| Documentation | | -| ------------------------------ | ---------------------- | -| 📋 [Tasks] | Overview of available tasks | -| 📐 [Benchmarks] | Overview of available benchmarks | -| 📈 [Leaderboard] | The interactive leaderboard of the benchmark | -| 🤖 [Adding a model] | Information related to how to submit a model to the leaderboard | +| Documentation | | +|--------------------------------|-------------------------------------------------------------------------------------| +| 📋 [Tasks] | Overview of available tasks | +| 📐 [Benchmarks] | Overview of available benchmarks | +| 📈 [Leaderboard] | The interactive leaderboard of the benchmark | +| 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | | 👩‍🔬 [Reproducible workflows] | Information related to how to reproduce and create reproducible workflows with MTEB | -| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | -| 👩‍💻 [Adding a leaderboard tab] | How to add a new leaderboard tab to MTEB | -| 🤝 [Contributing] | How to contribute to MTEB and set it up for development | -| 🌐 [MMTEB] | An open-source effort to extend MTEB to cover a broad set of languages | +| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | +| 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | +| 🤝 [Contributing] | How to contribute to MTEB and set it up for development | +| 🌐 [MMTEB] | An open-source effort to extend MTEB to cover a broad set of languages | [Tasks]: docs/tasks.md [Benchmarks]: docs/benchmarks.md [Contributing]: CONTRIBUTING.md [Adding a model]: docs/adding_a_model.md [Adding a dataset]: docs/adding_a_dataset.md -[Adding a leaderboard tab]: docs/adding_a_leaderboard_tab.md +[Adding a benchmark]: docs/adding_a_benchmark.md [Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard [MMTEB]: docs/mmteb/readme.md 
[Reproducible workflows]: docs/reproducible_workflow.md @@ -517,5 +517,6 @@ You may also want to read and cite the amazing work that has extended MTEB & int - Orion Weller, Benjamin Chang, Sean MacAvaney, Kyle Lo, Arman Cohan, Benjamin Van Durme, Dawn Lawrie, Luca Soldaini. "[FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions](https://arxiv.org/abs/2403.15246)" arXiv 2024 - Dawei Zhu, Liang Wang, Nan Yang, Yifan Song, Wenhao Wu, Furu Wei, Sujian Li. "[LongEmbed: Extending Embedding Models for Long Context Retrieval](https://arxiv.org/abs/2404.12096)" arXiv 2024 - Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024 +- Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. "[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024 For works that have used MTEB for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/mteb/leaderboard). diff --git a/docs/adding_a_benchmark.md b/docs/adding_a_benchmark.md new file mode 100644 index 0000000000..56a042fdb9 --- /dev/null +++ b/docs/adding_a_benchmark.md @@ -0,0 +1,7 @@ +## Adding a benchmark + +The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we encourage additions of new benchmarks. To add a new benchmark: + +1. Add your benchmark to [benchmarks.py](../mteb/benchmarks/benchmarks.py) as a `Benchmark` object, and select the MTEB tasks that will be in the benchmark. If some of the tasks do not exist in MTEB, follow the "add a dataset" instructions to add them. +2. 
Open a PR at https://github.com/embeddings-benchmark/results with results of models on your benchmark. +3. When PRs are merged, your benchmark will be added to the leaderboard automatically after the next workflow trigger. \ No newline at end of file diff --git a/docs/adding_a_leaderboard_tab.md b/docs/adding_a_leaderboard_tab.md deleted file mode 100644 index 260293ed5c..0000000000 --- a/docs/adding_a_leaderboard_tab.md +++ /dev/null @@ -1,15 +0,0 @@ -## Adding a new Leaderboard tab - -The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we love new leaderboard tabs. To add a new leaderboard tab: - -1. Open a PR in https://hf.co/datasets/mteb/results with: -- All results added in existing model folders or new folders -- Updated paths.json (see snippet results.py) -- If adding any new models, their names added to results.py -- If you have access to all models you are adding, you can also [add results via the metadata](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) for all of them / some of them -2. Open a PR at https://huggingface.co/spaces/mteb/leaderboard modifying app.py to add your tab: -- Add any new models & their specs to the global lists -- Add your tab, credits etc to where the other tabs are defined -- If you're adding new results to existing models, remove those models from `EXTERNAL_MODEL_RESULTS.json` such that they can be reloaded with the new results and are not cached. 
-- You may also have to uncomment `, download_mode='force_redownload', verification_mode="no_checks")` where the datasets are loaded to experiment locally without caching of results -- Test that it runs & works locally as you desire with python app.py, **please add screenshots to the PR** diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index f87d723934..088199e264 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -2,7 +2,63 @@ The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit to it: -1. **Run the desired model on MTEB:** +1. **Add meta information about your model to [model dir](../mteb/models/)**. + ```python + from mteb.model_meta import ModelMeta + + bge_m3 = ModelMeta( + name="model_name", + languages=["model_languages"], # in format eng-Latn + open_weights=True, + revision="5617a9f61b028005a4858fdac845db406aefb181", + release_date="2024-06-28", + n_parameters=568_000_000, + embed_dim=4096, + license="mit", + max_tokens=8194, + reference="https://huggingface.co/BAAI/bge-m3", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={"your_dataset": ["train"]}, + ) + ``` + By default, the model will run using the [`sentence_transformers_loader`](../mteb/models/sentence_transformer_wrapper.py) loader function. If you need to use a custom implementation, you can specify the `loader` parameter in the `ModelMeta` class. 
For example: + ```python + from mteb.models.wrapper import Wrapper + from mteb.encoder_interface import PromptType + import numpy as np + + class CustomWrapper(Wrapper): + def __init__(self, model_name, model_revision): + super().__init__(model_name, model_revision) + # your custom implementation here + + def encode( + self, + sentences: list[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs + ) -> np.ndarray: + # your custom implementation here + return np.zeros((len(sentences), self.embed_dim)) + ``` + Then you can specify the `loader` parameter in the `ModelMeta` class: + ```python + your_model = ModelMeta( + loader=partial( + CustomWrapper, + model_name="model_name", + model_revision="5617a9f61b028005a4858fdac845db406aefb181" + ), + ... + ) + ``` +2. **Run the desired model on MTEB:** Either use the Python API: @@ -28,64 +84,51 @@ mteb run -m {model_name} -t {task_names} These will save the results in a folder called `results/{model_name}/{model_revision}`. -<<<<<<< HEAD -1. **Format the results using the CLI:** -======= 2. **Push Results to the Leaderboard** To add results to the public leaderboard you can push your results to the [results repository](https://github.com/embeddings-benchmark/results) via a PR. Once merged they will appear on the leaderboard after a day. +3. **Wait for a refresh of the leaderboard** -3. (Optional) **Add results to the model card:** - -`mteb` implements a cli for adding results to the model card: ->>>>>>> main - -```bash -mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md -``` - -To add the content to the public model simply copy the content of the `model_card.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example. 
-If the readme already exists: - -```bash -mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md -``` - -<<<<<<< HEAD -2. **Add the frontmatter to model repository:** +**Notes:** -Copy the content of the `model_card.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example. -======= -Note that running the model on many tasks may lead to a huge readme front matter. ->>>>>>> main +##### Using Prompts with Sentence Transformers -3. **Wait for a refresh the leaderboard:** +If your model uses Sentence Transformers and requires different prompts for encoding the queries and corpus, you can take advantage of the `prompts` [parameter](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer). -The leaderboard [automatically refreshes daily](https://github.com/embeddings-benchmark/leaderboard/commits/main/) so once submitted you only need to wait for the automatic refresh. You can find the workflows for the leaderboard refresh [here](https://github.com/embeddings-benchmark/leaderboard/tree/main/.github/workflows). If you experience issues with the leaderboard please create an [issue](https://github.com/embeddings-benchmark/mteb/issues). +Internally, `mteb` uses `query` as the prompt name for encoding the queries and `passage` as the prompt name for encoding the corpus. This is aligned with the default names used by Sentence Transformers. -**Notes:** -- We remove models with scores that cannot be reproduced, so please ensure that your model is accessible and scores can be reproduced. 
-<<<<<<< HEAD -- An alternative way of submitting to the leaderboard is by opening a PR with your results [here](https://github.com/embeddings-benchmark/results) & checking that they are displayed correctly by [locally running the leaderboard](https://github.com/embeddings-benchmark/leaderboard?tab=readme-ov-file#developer-setup) -======= ->>>>>>> main +###### Adding the prompts in the model configuration (Preferred) -- ##### Using Prompts with Sentence Transformers - - If your model uses Sentence Transformers and requires different prompts for encoding the queries and corpus, you can take advantage of the `prompts` [parameter](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer). - - Internally, `mteb` uses the prompt named `query` for encoding the queries and `passage` as the prompt name for encoding the corpus. This is aligned with the default names used by Sentence Transformers. +You can directly add the prompts when saving and uploading your model to the Hub. For an example, refer to this [configuration file](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json). These prompts can then be specified in the ModelMeta object. - ###### Adding the prompts in the model configuration (Preferred) - You can directly add the prompts when saving and uploading your model to the Hub. For an example, refer to this [configuration file](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json). 
+```python +model = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="intfloat/multilingual-e5-small", + revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", + model_prompts={ + "query": "query: ", + "passage": "passage: ", + }, + ), +) +``` +If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the `sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. - ###### Instantiating the Model with Prompts +##### Adding instruction models -<<<<<<< HEAD - If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the `sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. -======= - If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the `sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. ->>>>>>> main +Models that use instructions can use the [`InstructSentenceTransformerWrapper`](../mteb/models/instruct_wrapper.py). For example: +```python +model = ModelMeta( + loader=partial( + InstructSentenceTransformerWrapper, + model="nvidia/NV-Embed-v1", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template="Instruct: {instruction}\nQuery: ", + ), + ... +) +``` diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a5abe50215..7c0f07d878 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -7,16 +7,27 @@ The following table gives you an overview of the benchmarks in MTEB. 
| Name | # Tasks | Task Types | Domains | Languages | |------|---------|------------|---------|-----------| -| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | -| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | +| [BRIGHT](https://brightbenchmark.github.io/) | 1 | {'Retrieval': 1} | [Non-fiction] | eng | +| [ChemTEB](https://arxiv.org/abs/2412.00532) | 27 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 5, 'Retrieval': 2} | [Chemistry] | nld,tur,eng,ces,kor,zho,spa,hin,jpn,deu,fra,msa,por | +| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | javascript,ruby,sql,go,eng,java,php,python,c++ | +| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Fiction, Academic, Written, Blog, Non-fiction, Spoken, Encyclopaedic] | eng | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | 
sqi,ban,srp,jpn,nds,lat,por,mon,kur,bul,slv,mak,deu,uzb,yor,kzj,max,kat,cha,yid,zsm,spa,pms,mhr,min,fao,heb,nij,mui,tuk,rus,bew,swe,pes,slk,ceb,bjn,ido,abs,ukr,ina,kab,tgl,cor,dan,kaz,fry,rej,hrv,ces,lfn,glg,dsb,hau,ace,urd,ben,yue,nld,eng,epo,ron,xho,wuu,cmn,ind,ang,hsb,mad,pam,nov,swh,bbc,pcm,ara,hye,mkd,nno,ast,jav,lvs,mal,swg,nob,tat,arz,vie,ile,tam,est,ber,bre,csb,pol,afr,cbk,bug,tzl,kor,ibo,hun,war,aze,tha,mar,uig,gla,orv,hin,amh,bel,sun,fin,cat,awa,gsw,isl,oci,ell,cym,arq,ita,fra,bos,dtp,eus,bhp,tel,tur,khm,lit,gle | +| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, News, Government, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,box,rus,chz,emp,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,hrv,nak,plu,nde,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,ngu,cpa,tif,fuv,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,b
ul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,cpc,quc,hvn,bvr,agu,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,pbt,mmo,amo,kgp,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,ign,vmy,uri,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,knv,heg,urw,ayr,ons,sat,crx,rop,szl,suz,ncl,anh,kto,tca,chk,xla,qxh,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,tbf,ubr,cth,taj,aby,kde,mqj,zao,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,kmb,cof,tso,ipi,byr,aia,wiv,agm,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,b
kx,zsr,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,bzd,shj,ukr,mbl,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,fas,cbr,bjr,glv,mdy,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,tzo,arb,mos,kam,cuc,dgc,pah,pjt,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,eko,zia,mcp,gah,urt,sua,cab,quh,srm,vid,blz,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | +| MTEB(Indic, beta) | 23 | {'BitextMining': 4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [Web, Fiction, Social, Encyclopaedic, Religious, Written, Constructed, Non-fiction, Legal, News, Spoken, Reviews, Government] | 
ban,pag,ckb,ydd,srp,azj,jpn,bho,por,sna,als,scn,cjk,zho,mwr,bul,slv,deu,yor,bak,ory,aka,bam,kat,lua,kan,dzo,mya,zsm,spa,shn,min,nus,fao,heb,kac,lug,tuk,kea,rus,ssw,tum,swe,nso,pes,slk,som,mup,pbt,nya,ceb,bjn,kmr,apc,taq,ukr,kab,luo,tgl,dik,dan,kaz,kbp,hrv,ces,glg,ary,hau,ace,urd,ben,boy,ewe,ilo,yue,lin,nld,eng,hne,epo,kir,grn,ron,xho,smo,fur,knc,cmn,ind,ayr,sat,szl,pap,fas,kmb,tso,ltz,swh,brx,zul,azb,doi,ara,hye,mkd,nno,ast,jav,lvs,mal,lao,sot,wol,nob,ltg,tat,san,arz,lmo,vec,nor,vie,sag,khk,arb,mos,kam,tam,bgc,mai,gbm,srd,est,twi,crh,sin,nep,swa,umb,bod,pol,lim,nqo,afr,bug,kor,ibo,mri,hun,aeb,war,ars,mni,fon,tha,mar,tpi,tzm,acq,pan,uzn,kik,gla,uig,hin,lij,tgk,amh,bel,sun,acm,guj,fin,cat,awa,fij,npi,run,tsn,kas,tir,isl,asm,mlt,ell,oci,mag,cym,pus,gom,quy,ajp,raj,fuv,ita,kin,bos,fra,gaz,eus,tel,tur,snd,kon,khm,bem,dyu,gle,hat,lit,prs,lus,plt | +| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Web, Academic, Medical, Written, Non-fiction, Government] | rus,eng,kor,ara,spa,zho,vie,fra,pol,cmn | +| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, Government, News, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | 
qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,mui,box,rus,chz,emp,bew,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,rej,hrv,nak,plu,nde,lfn,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,arq,ngu,cpa,tif,fuv,raj,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,yid,pms,mhr,cpc,quc,hvn,bvr,agu,svk,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,mup,pbt,mmo,amo,kgp,ido,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,chv,ign,vmy,cor,uri,fry,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,boy,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,wuu,knv,heg,urw,ayr,ons,sat,crx,ang,hsb,rop,szl,suz,mad,ncl,anh,kto,tca,chk,xla,qxh,brx,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,bgc,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,ber,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,csb,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,orv,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,max,
tbf,ubr,cth,taj,aby,kde,mqj,zao,tyv,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,sah,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,mak,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,pam,kmb,cof,tso,ipi,byr,aia,wiv,pcm,agm,doi,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,tzl,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,gsw,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,nds,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,mwr,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,kzj,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,nij,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,abs,bzd,shj,ukr,mbl,ina,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,dsb,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,nov,fas,cbr,bjr,glv,mdy,bbc,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,swg,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,ile,tzo,arb,mos,kam,cuc,dgc,pah,pjt,gbm,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,krc,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,
sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,gom,eko,zia,mcp,gah,urt,sua,cab,quh,srm,dtp,vid,blz,bhp,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | | [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | -| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | -| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | -| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | -| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, Academic] | tur,fra,eng,cmn,pol,ita,nld,spa,deu,ara | -| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, News, Social, Reviews, Written, Web, Legal, Academic] | eng,deu,pol,fra | -| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | kor | -| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | eng,deu,zho | -| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Spoken, 
Non-fiction, News, Fiction, Social, Written, Web, Legal, Academic] | pol,deu,eng,fra | -| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Encyclopaedic, Spoken, Blog, News, Social, Reviews, Written, Web, Academic] | rus | +| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Web, Fiction, Social, Written, Blog, Non-fiction, Legal, News, Spoken, Reviews, Government, Encyclopaedic] | swe,nno,isl,dan,fao,nob | +| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | javascript,ruby,sql,go,c,eng,shell,typescript,rust,java,php,python,scala,swift,c++ | +| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | +| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Non-fiction, Legal, News, Spoken, Reviews, Encyclopaedic] | 
pol,deu,fra,eng | +| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [Web, Academic, Written, Non-fiction, News, Spoken, Reviews, Encyclopaedic] | jpn | +| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | kor | +| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | deu,zho,eng | +| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Web, Fiction, Academic, Social, Written, Non-fiction, Legal, News, Spoken] | pol,deu,fra,eng | +| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Web, Social, Academic, Written, Blog, News, Spoken, Reviews, Encyclopaedic] | rus | +| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [Web, Academic, Social, Medical, Written, Non-fiction, News, Encyclopaedic] | eng | +| [RAR-b](https://arxiv.org/abs/2404.06347) | 17 | {'Retrieval': 17} | [Encyclopaedic, Written, Programming] | eng | \ No newline at end of file diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py index 4a1be0cd89..33dca958cb 100644 --- a/docs/create_tasks_table.py +++ b/docs/create_tasks_table.py @@ -50,7 +50,9 @@ def task_to_markdown_row(task: mteb.AbsTask) -> str: f"[{name}]({task.metadata.reference})" if task.metadata.reference else name ) domains = ( - "[" + ", ".join(task.metadata.domains) + "]" if task.metadata.domains else "" + "[" + ", ".join(sorted(task.metadata.domains)) + "]" + if task.metadata.domains + else "" ) n_samples = task.metadata.n_samples dataset_statistics = 
round_floats_in_dict(task.metadata.descriptive_stats) diff --git a/docs/mmteb/points_table.md b/docs/mmteb/points_table.md index 85978dcc00..dfb4a6b31c 100644 --- a/docs/mmteb/points_table.md +++ b/docs/mmteb/points_table.md @@ -2,206 +2,103 @@ _Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions. -<<<<<<< HEAD - | GitHub | Paper writing | New dataset | Review PR | Bug fixes | Coordination | Dataset annotations | New task | Running Models | Total | -|:------------------|----------------:|--------------:|------------:|------------:|---------------:|----------------------:|-----------:|-----------------:|--------:| -| KennethEnevoldsen | 0 | 68 | 326 | 87 | 81 | 35 | 0 | 0 | 597 | -| isaac-chung | 12 | 120 | 194 | 50 | 54 | 1 | 2 | 0 | 433 | -| imenelydiaker | 0 | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 358 | -| awinml | 0 | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 302 | -| x-tabdeveloping | 0 | 144 | 32 | 10 | 41 | 0 | 12 | 0 | 239 | -| davidstap | 0 | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | -| jaygala24 | 0 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | -| wissam-sib | 0 | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 144 | -| Muennighoff | 0 | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 142 | -| orionw | 0 | 0 | 20 | 20 | 75 | 0 | 10 | 0 | 125 | -| dokato | 0 | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 112 | -| gentaiscool | 0 | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | -| jupyterjazz | 0 | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | -| SaitejaUtpala | 0 | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | -| vaibhavad | 0 | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 93 | -| schmarion | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| MathieuCiancone | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| GabrielSequeira | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| digantamisra98 | 0 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | -| shreeya-dhakal | 0 | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 62 | -| Rysias | 0 | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | -| Samoed | 0 | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 51 | -| sivareddyg | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 50 | -| 
gowitheflow-1998 | 0 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | -| asparius | 0 | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 48 | -| Akash190104 | 0 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | -| MartinBernstorff | 0 | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 43 | -| akshita-sukhlecha | 0 | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 40 | -| staoxiao | 0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | -| bp-high | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| rafalposwiata | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| KranthiGV | 0 | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 34 | -| loicmagne | 0 | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 28 | -| ShawonAshraf | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| bjoernpl | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| jphme | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| rasdani | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| violenil | 0 | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| mariyahendriksen | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| dwzhu-pku | 0 | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| hgissbkh | 3 | 0 | 2 | 13 | 0 | 0 | 5 | 0 | 23 | -| taeminlee | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| kwojtasi | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| jankounchained | 0 | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 22 | -| tomaarsen | 0 | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 22 | -| crystina-z | 0 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| mrshu | 0 | 16 | 4 | 0 | 0 | 1 | 0 | 0 | 21 | -| john-b-yang | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| rbroc | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| mmhamdy | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| ManuelFay | 0 | 2 | 0 | 13 | 0 | 0 | 5 | 0 | 20 | -| AlexeyVatolin | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 20 | -| Andrian0s | 0 | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 20 | -| thakur-nandan | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| manandey | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| PranjalChitale | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| dipam7 | 0 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 16 | -| sted97 | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| Sakshamrzt | 0 | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 16 | -| taidnguyen | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| 
artemsnegirev | 0 | 12 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | -| slvnwhrl | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| anpalmak2003 | 0 | 9 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | -| Art3mis07 | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| guenthermi | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| jordiclive | 0 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 12 | -| xhluca | 0 | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 12 | -| henilp105 | 0 | 0 | 0 | 2 | 0 | 9 | 0 | 0 | 11 | -| MariyaTikhonova | 0 | 7 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | -| ab1992ao | 0 | 8 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | -| tmp_handle | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | -| swj0419 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Ruqyai | 0 | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 10 | -| ZhengLiu101 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Alenush | 0 | 6 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | -| ABorghini | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| simon-clematide | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sarahooker | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| guangyusong | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| HLasse | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 0 | 10 | -| cassanof | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 10 | -| hongjin-su | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xiamengzhou | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xu3kev | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| howard-yen | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| malteos | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ljvmiranda921 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| marcobellagente93 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| izhx | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| MexicanLemonade | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| antoniolanza1996 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | -| achibb | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| NouamaneTazi | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| PhilipMay | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| cslizc | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bakrianoo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| hanhainebula | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| monikernemo | 0 | 2 | 0 | 0 | 0 | 0 | 
0 | 0 | 2 | -======= - | GitHub | New dataset | Review PR | Bug fixes | Coordination | Paper writing | Dataset annotations | Running Models | New task | Total | -|:------------------|--------------:|------------:|------------:|---------------:|----------------:|----------------------:|-----------------:|-----------:|--------:| -| KennethEnevoldsen | 68 | 326 | 87 | 81 | 0 | 35 | 0 | 0 | 597 | -| isaac-chung | 120 | 194 | 50 | 54 | 12 | 1 | 0 | 2 | 433 | -| imenelydiaker | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 0 | 358 | -| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | -| x-tabdeveloping | 144 | 32 | 10 | 41 | 0 | 0 | 0 | 12 | 239 | -| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | -| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | -| wissam-sib | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 0 | 144 | -| Muennighoff | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 0 | 142 | -| orionw | 0 | 20 | 20 | 75 | 0 | 0 | 0 | 10 | 125 | -| dokato | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 0 | 112 | -| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | -| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | -| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | -| vaibhavad | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 0 | 93 | -| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | -| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | -| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | -| Samoed | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 0 | 51 | -| sivareddyg | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 0 | 50 | -| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | -| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | -| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | -| MartinBernstorff | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 0 | 43 | -| akshita-sukhlecha | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 40 | -| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | -| bp-high | 36 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 36 | -| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | -| loicmagne | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 28 | -| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| rasdani | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| mariyahendriksen | 0 | 0 | 0 | 0 | 24 | 0 | 0 | 0 | 24 | -| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| hgissbkh | 0 | 2 | 13 | 0 | 3 | 0 | 0 | 5 | 23 | -| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| jankounchained | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 22 | -| tomaarsen | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 0 | 22 | -| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | -| john-b-yang | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 20 | -| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| ManuelFay | 2 | 0 | 13 | 0 | 0 | 0 | 0 | 5 | 20 | -| AlexeyVatolin | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 20 | -| Andrian0s | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 20 | -| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | -| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | -| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| jordiclive | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 12 | -| xhluca | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 
12 | -| henilp105 | 0 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | 11 | -| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | -| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | -| tmp_handle | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 10 | -| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | -| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sarahooker | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | -| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| HLasse | 0 | 0 | 5 | 0 | 0 | 5 | 0 | 0 | 10 | -| cassanof | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 | -| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ljvmiranda921 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| antoniolanza1996 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bakrianoo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ->>>>>>> main + | GitHub | New dataset | Review PR | Running Models | Bug fixes | Coordination | Dataset annotations | Paper writing | New task | Total | +|:------------------|--------------:|------------:|-----------------:|------------:|---------------:|----------------------:|----------------:|-----------:|--------:| +| KennethEnevoldsen | 68 | 326 
| 0 | 87 | 81 | 35 | 0 | 0 | 597 | +| isaac-chung | 120 | 194 | 0 | 50 | 54 | 1 | 12 | 2 | 433 | +| imenelydiaker | 120 | 144 | 0 | 24 | 70 | 0 | 0 | 0 | 358 | +| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | +| x-tabdeveloping | 144 | 32 | 0 | 10 | 41 | 0 | 0 | 12 | 239 | +| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | +| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | +| wissam-sib | 134 | 6 | 0 | 4 | 0 | 0 | 0 | 0 | 144 | +| Muennighoff | 0 | 48 | 24 | 0 | 70 | 0 | 0 | 0 | 142 | +| orionw | 0 | 20 | 0 | 20 | 75 | 0 | 0 | 10 | 125 | +| dokato | 94 | 6 | 0 | 12 | 0 | 0 | 0 | 0 | 112 | +| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | +| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | +| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | +| vaibhavad | 6 | 4 | 0 | 8 | 75 | 0 | 0 | 0 | 93 | +| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | +| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | +| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | +| Samoed | 18 | 2 | 9 | 22 | 0 | 0 | 0 | 0 | 51 | +| sivareddyg | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 50 | +| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | +| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | +| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | +| MartinBernstorff | 2 | 8 | 0 | 13 | 20 | 0 | 0 | 0 | 43 | +| akshita-sukhlecha | 36 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 40 | +| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | +| bp-high | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | +| loicmagne | 0 | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 28 | +| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| rasdani 
| 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| mariyahendriksen | 0 | 0 | 0 | 0 | 0 | 0 | 24 | 0 | 24 | +| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| hgissbkh | 0 | 2 | 0 | 13 | 0 | 0 | 3 | 5 | 23 | +| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| jankounchained | 14 | 0 | 0 | 8 | 0 | 0 | 0 | 0 | 22 | +| tomaarsen | 0 | 2 | 0 | 0 | 20 | 0 | 0 | 0 | 22 | +| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | +| john-b-yang | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 0 | 20 | +| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| ManuelFay | 2 | 0 | 0 | 13 | 0 | 0 | 0 | 5 | 20 | +| AlexeyVatolin | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 20 | +| Andrian0s | 14 | 4 | 0 | 2 | 0 | 0 | 0 | 0 | 20 | +| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | +| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | +| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| jordiclive | 2 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 12 | +| xhluca | 6 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 12 | +| henilp105 | 0 | 0 | 0 | 2 | 0 | 9 | 0 | 0 | 11 | +| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | +| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | +| tmp_handle | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | +| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 10 | +| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | +| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sarahooker | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 10 | +| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| HLasse | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 0 | 10 | +| cassanof | 8 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 10 | +| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ljvmiranda921 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| antoniolanza1996 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | +| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bakrianoo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md index d4e6b376ad..bc9a4e99a4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -12,38 +12,17 @@ The following tables give you an overview of the tasks in MTEB. 
| [AILAStatutes](https://zenodo.org/records/4063986) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None | | [AJGT](https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66/) (Alomari et al., 2017) | ['ara'] | Classification | s2s | [Social, Written] | None | None | | [ARCChallenge](https://allenai.org/data/arc) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [AROCocoOrder](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None | +| [AROFlickrOrder](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None | +| [AROVisualAttribution](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None | +| [AROVisualRelation](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None | | [ATEC](https://aclanthology.org/2021.emnlp-main.357) | ['cmn'] | STS | s2s | | None | None | -<<<<<<< HEAD -| [AfriSentiClassification](https://arxiv.org/abs/2302.08956) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | {'test': 2048} | {'test': 74.77} | -| [AfriSentiLangClassification](https://huggingface.co/datasets/HausaNLP/afrisenti-lid-data/) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | {'test': 5754} | {'test': 77.84} | -| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | ['pol'] | Classification | s2s | | {'test': 1006} | 
{'test': 477.2} | -| [AlloProfClusteringP2P.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | p2p | [Encyclopaedic, Written] | {'test': 2556} | {'test': 3539.5} | -| [AlloProfClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | s2s | [Encyclopaedic, Written] | {'test': 2556} | {'test': 32.8} | -| [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Web, Academic, Written] | {'test': 2316, 'train': 9264} | None | -| [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | {'train': 2048} | {'test': {'average_document_length': 3505.705399061033, 'average_query_length': 170.71286701208982, 'num_documents': 2556, 'num_queries': 2316, 'average_relevant_docs_per_query': 1.0}} | -| [AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 1532} | {'test': {'average_document_length': 43.42647308646886, 'average_query_length': 103.05483028720627, 'num_documents': 241347, 'num_queries': 1532, 'average_relevant_docs_per_query': 1.0}} | -| [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | {'validation': 335, 'test': 670} | {'validation': 109.2, 'test': 106.1} | -| [AmazonPolarityClassification](https://huggingface.co/datasets/amazon_polarity) (Julian McAuley, 2013) | ['eng'] | Classification | p2p | [Reviews, Written] | {'test': 400000} | {'test': 431.4} | -| [AmazonReviewsClassification](https://arxiv.org/abs/2010.02573) (Phillip Keung, 2020) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'spa'] | Classification | s2s | [Reviews, Written] | {'validation': 30000, 'test': 
30000} | {'validation': 159.2, 'test': 160.4} | -| [AngryTweetsClassification](https://aclanthology.org/2021.nodalida-main.53/) (Pauli et al., 2021) | ['dan'] | Classification | s2s | [Social, Written] | {'test': 1050} | {'test': 156.1} | -| [AppsRetrieval](https://arxiv.org/abs/2105.09938) (Dan Hendrycks, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'average_document_length': 575.0086708499715, 'average_query_length': 1669.8284196547145, 'num_documents': 8765, 'num_queries': 3765, 'average_relevant_docs_per_query': 1.0}} | -| [ArEntail](https://link.springer.com/article/10.1007/s10579-024-09731-1) (Obeidat et al., 2024) | ['ara'] | PairClassification | s2s | [News, Written] | {'test': 1000} | {'test': 65.77} | -| [ArXivHierarchicalClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'average_text_length': 1008.439453125, 'average_labels_per_text': 1.46337890625, 'unique_labels': 129, 'labels': {'cs': {'count': 356}, 'math': {'count': 381}, 'OC': {'count': 11}, 'hep-lat': {'count': 13}, 'hep': {'count': 98}, 'astro-ph': {'count': 213}, 'eess': {'count': 76}, 'quant-ph': {'count': 135}, 'DC': {'count': 5}, 'cond-mat': {'count': 274}, 'hep-th': {'count': 66}, 'SP': {'count': 33}, 'hep-ph': {'count': 69}, 'FA': {'count': 6}, 'nucl-th': {'count': 17}, 'q-bio': {'count': 80}, 'HE': {'count': 22}, 'HC': {'count': 2}, 'stat': {'count': 60}, 'ML': {'count': 16}, 'IV': {'count': 13}, 'stat-mech': {'count': 47}, 'DS': {'count': 14}, 'ME': {'count': 12}, 'CC': {'count': 2}, 'mtrl-sci': {'count': 22}, 'PE': {'count': 16}, 'NT': {'count': 11}, 'SC': {'count': 6}, 'AG': {'count': 13}, 'physics': {'count': 81}, 'ins-det': {'count': 9}, 'GA': {'count': 18}, 'BM': {'count': 6}, 'GN': {'count': 17}, 'NA': {'count': 15}, 'app-ph': {'count': 7}, 'RT': {'count': 6}, 'other': {'count': 37}, 'soft': {'count': 15}, 'CO': 
{'count': 33}, 'supr-con': {'count': 21}, 'chem-ph': {'count': 3}, 'DM': {'count': 2}, 'MN': {'count': 12}, 'q-fin': {'count': 27}, 'PM': {'count': 2}, 'AP': {'count': 27}, 'gr-qc': {'count': 15}, 'quant-gas': {'count': 8}, 'mes-hall': {'count': 33}, 'IT': {'count': 19}, 'SI': {'count': 6}, 'SG': {'count': 3}, 'bio-ph': {'count': 2}, 'SR': {'count': 16}, 'soc-ph': {'count': 5}, 'hep-ex': {'count': 15}, 'DG': {'count': 11}, 'NE': {'count': 5}, 'CR': {'count': 6}, 'CL': {'count': 12}, 'RM': {'count': 3}, 'econ': {'count': 17}, 'nlin': {'count': 5}, 'PS': {'count': 1}, 'LG': {'count': 26}, 'QA': {'count': 9}, 'str-el': {'count': 26}, 'CV': {'count': 34}, 'MF': {'count': 6}, 'IM': {'count': 7}, 'EM': {'count': 6}, 'TH': {'count': 5}, 'PR': {'count': 20}, 'AT': {'count': 4}, 'OA': {'count': 4}, 'CP': {'count': 6}, 'LO': {'count': 14}, 'flu-dyn': {'count': 6}, 'atom-ph': {'count': 8}, 'class-ph': {'count': 1}, 'SY': {'count': 20}, 'IR': {'count': 1}, 'plasm-ph': {'count': 8}, 'CE': {'count': 2}, 'AO': {'count': 1}, 'comp-ph': {'count': 3}, 'optics': {'count': 12}, 'MG': {'count': 4}, 'ST': {'count': 6}, 'nucl-ex': {'count': 6}, 'CY': {'count': 9}, 'ao-ph': {'count': 2}, 'DB': {'count': 1}, 'math-ph': {'count': 10}, 'NC': {'count': 13}, 'GT': {'count': 11}, 'TO': {'count': 2}, 'AI': {'count': 9}, 'NI': {'count': 2}, 'gen-ph': {'count': 4}, 'OT': {'count': 4}, 'SD': {'count': 2}, 'dis-nn': {'count': 4}, 'RO': {'count': 7}, 'CA': {'count': 6}, 'FL': {'count': 1}, 'SE': {'count': 5}, 'EP': {'count': 9}, 'hist-ph': {'count': 1}, 'QM': {'count': 9}, 'ed-ph': {'count': 2}, 'GR': {'count': 4}, 'MS': {'count': 1}, 'CD': {'count': 1}, 'ET': {'count': 1}, 'acc-ph': {'count': 5}, 'AC': {'count': 2}, 'OH': {'count': 1}, 'EC': {'count': 2}, 'DL': {'count': 1}, 'AS': {'count': 3}, 'geo-ph': {'count': 2}, 'CG': {'count': 3}, 'CB': {'count': 1}, 'AR': {'count': 1}, 'TR': {'count': 1}, 'atm-clus': {'count': 1}}}} | -| 
[ArXivHierarchicalClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': 1009.98} | -| [ArguAna](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | {'test': {'average_document_length': 1029.2327645838136, 'average_query_length': 1192.7204836415362, 'num_documents': 8674, 'num_queries': 1406, 'average_relevant_docs_per_query': 1.0}} | -| [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1060.702674659903, 'average_query_length': 1224.8022759601706, 'num_documents': 8674, 'num_queries': 1406, 'average_relevant_docs_per_query': 1.0}} | -| [ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | {'train': 4023, 'test': 1470} | {'train': 243.81, 'test': 241.37} | -| [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | {'test': 2048} | {} | -| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | | {'test': 2255} | {'test': {'num_samples': 375, 'num_positive': 375, 'num_negative': 375, 'avg_query_len': 50.205333333333336, 'avg_positive_len': 6.013333333333334, 'avg_negative_len': 13.986666666666666}} | -| [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | {'test': 2448} | {'test': 53.55} | -| [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | {'test': 2448} | {'test': 53.55} | -======= | [AfriSentiClassification](https://arxiv.org/abs/2302.08956) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 
'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [AfriSentiLangClassification](https://huggingface.co/datasets/HausaNLP/afrisenti-lid-data/) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | ['pol'] | Classification | s2s | | None | None | | [AlloProfClusteringP2P.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [AlloProfClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | s2s | [Encyclopaedic, Written] | None | None | -| [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Web, Academic, Written] | None | None | +| [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Academic, Web, Written] | None | None | | [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | None | None | @@ -55,229 +34,86 @@ The following tables give you an overview of the tasks in MTEB. 
| [ArXivHierarchicalClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 2065284, 'min_text_length': 103, 'average_text_length': 1008.44, 'max_text_length': 2103, 'min_labels_per_text': 1, 'average_labels_per_text': 1.46, 'max_labels_per_text': 381, 'unique_labels': 129, 'labels': {'cs': {'count': 356}, 'math': {'count': 381}, 'OC': {'count': 11}, 'hep-lat': {'count': 13}, 'hep': {'count': 98}, 'astro-ph': {'count': 213}, 'eess': {'count': 76}, 'quant-ph': {'count': 135}, 'DC': {'count': 5}, 'cond-mat': {'count': 274}, 'hep-th': {'count': 66}, 'SP': {'count': 33}, 'hep-ph': {'count': 69}, 'FA': {'count': 6}, 'nucl-th': {'count': 17}, 'q-bio': {'count': 80}, 'HE': {'count': 22}, 'HC': {'count': 2}, 'stat': {'count': 60}, 'ML': {'count': 16}, 'IV': {'count': 13}, 'stat-mech': {'count': 47}, 'DS': {'count': 14}, 'ME': {'count': 12}, 'CC': {'count': 2}, 'mtrl-sci': {'count': 22}, 'PE': {'count': 16}, 'NT': {'count': 11}, 'SC': {'count': 6}, 'AG': {'count': 13}, 'physics': {'count': 81}, 'ins-det': {'count': 9}, 'GA': {'count': 18}, 'BM': {'count': 6}, 'GN': {'count': 17}, 'NA': {'count': 15}, 'app-ph': {'count': 7}, 'RT': {'count': 6}, 'other': {'count': 37}, 'soft': {'count': 15}, 'CO': {'count': 33}, 'supr-con': {'count': 21}, 'chem-ph': {'count': 3}, 'DM': {'count': 2}, 'MN': {'count': 12}, 'q-fin': {'count': 27}, 'PM': {'count': 2}, 'AP': {'count': 27}, 'gr-qc': {'count': 15}, 'quant-gas': {'count': 8}, 'mes-hall': {'count': 33}, 'IT': {'count': 19}, 'SI': {'count': 6}, 'SG': {'count': 3}, 'bio-ph': {'count': 2}, 'SR': {'count': 16}, 'soc-ph': {'count': 5}, 'hep-ex': {'count': 15}, 'DG': {'count': 11}, 'NE': {'count': 5}, 'CR': {'count': 6}, 'CL': {'count': 12}, 'RM': {'count': 3}, 'econ': {'count': 17}, 'nlin': {'count': 5}, 'PS': {'count': 1}, 'LG': {'count': 26}, 'QA': {'count': 9}, 'str-el': {'count': 26}, 'CV': {'count': 34}, 
'MF': {'count': 6}, 'IM': {'count': 7}, 'EM': {'count': 6}, 'TH': {'count': 5}, 'PR': {'count': 20}, 'AT': {'count': 4}, 'OA': {'count': 4}, 'CP': {'count': 6}, 'LO': {'count': 14}, 'flu-dyn': {'count': 6}, 'atom-ph': {'count': 8}, 'class-ph': {'count': 1}, 'SY': {'count': 20}, 'IR': {'count': 1}, 'plasm-ph': {'count': 8}, 'CE': {'count': 2}, 'AO': {'count': 1}, 'comp-ph': {'count': 3}, 'optics': {'count': 12}, 'MG': {'count': 4}, 'ST': {'count': 6}, 'nucl-ex': {'count': 6}, 'CY': {'count': 9}, 'ao-ph': {'count': 2}, 'DB': {'count': 1}, 'math-ph': {'count': 10}, 'NC': {'count': 13}, 'GT': {'count': 11}, 'TO': {'count': 2}, 'AI': {'count': 9}, 'NI': {'count': 2}, 'gen-ph': {'count': 4}, 'OT': {'count': 4}, 'SD': {'count': 2}, 'dis-nn': {'count': 4}, 'RO': {'count': 7}, 'CA': {'count': 6}, 'FL': {'count': 1}, 'SE': {'count': 5}, 'EP': {'count': 9}, 'hist-ph': {'count': 1}, 'QM': {'count': 9}, 'ed-ph': {'count': 2}, 'GR': {'count': 4}, 'MS': {'count': 1}, 'CD': {'count': 1}, 'ET': {'count': 1}, 'acc-ph': {'count': 5}, 'AC': {'count': 2}, 'OH': {'count': 1}, 'EC': {'count': 2}, 'DL': {'count': 1}, 'AS': {'count': 3}, 'geo-ph': {'count': 2}, 'CG': {'count': 3}, 'CB': {'count': 1}, 'AR': {'count': 1}, 'TR': {'count': 1}, 'atm-clus': {'count': 1}}}} | | [ArXivHierarchicalClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [ArguAna](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | -| [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [ArguAna-Fa](https://huggingface.co/datasets/MCINext/arguana-fa) | ['fas'] | Retrieval | s2p | [Blog] | None | None | +| [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Medical, Written] | None | None | | 
[ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | None | None | | [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | -| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | +| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | | [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | None | None | | [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | -| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Government, Medical, Legal, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 
'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | ->>>>>>> main -| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | | None | None | +| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | +| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None | +| [BLINKIT2IMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyMultiChoice | it2i | [Encyclopaedic] | None | None | +| [BLINKIT2IRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None | +| [BLINKIT2TMultiChoice](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyMultiChoice | it2t | [Encyclopaedic] | None | None | +| [BLINKIT2TRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | | [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 
'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | | [Banking77Classification](https://arxiv.org/abs/2003.04807) | ['eng'] | Classification | s2s | [Written] | None | None | -| [BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 
'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [Web, News, Written] | {'test': 521866} | {'test': {'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 
488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 
'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 
'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 
'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 
488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 
488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 
'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 
'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}}}} | +| [BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [News, Web, Written] | {'test': 521866} | {'test': {'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 
'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': 
{'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': 
{'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 
'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': 
{'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 
'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 
'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': 
{'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': 
{'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': 
{'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': 
{'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 
'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': 
{'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': 
{'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 
'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 
'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': 
{'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': 
{'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 
'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 
'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': 
{'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': 
{'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 
'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': 
{'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': 
{'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}}}} | | [BengaliDocumentClassification](https://aclanthology.org/2023.eacl-main.4) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliHateSpeechClassification](https://huggingface.co/datasets/bn_hate_speech) (Karim et al., 2020) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliSentimentAnalysis](https://data.mendeley.com/datasets/p6zc7krs37/4) (Sazzed et al., 2020) | ['ben'] | Classification | s2s | [Reviews, Written] | None | None | +| [BeytooteClustering](https://mcinext.com/) | ['fas'] | Clustering | p2p | [News] | None | None | | [BibleNLPBitextMining](https://arxiv.org/abs/2304.09919) (Akerman et al., 2023) | ['aai', 'aak', 'aau', 'aaz', 'abt', 'abx', 'aby', 'acf', 'acr', 'acu', 'adz', 'aer', 'aey', 'agd', 'agg', 'agm', 'agn', 'agr', 'agt', 'agu', 'aia', 'aii', 'aka', 'ake', 'alp', 'alq', 'als', 'aly', 'ame', 'amf', 'amk', 'amm', 'amn', 'amo', 'amp', 'amr', 'amu', 'amx', 'anh', 'anv', 'aoi', 'aoj', 'aom', 'aon', 'apb', 'ape', 'apn', 'apr', 'apu', 'apw', 'apz', 'arb', 'are', 'arl', 'arn', 'arp', 'asm', 'aso', 'ata', 'atb', 'atd', 'atg', 'att', 'auc', 'aui', 'auy', 'avt', 'awb', 'awk', 'awx', 'azb', 'azg', 'azz', 'bao', 'bba', 'bbb', 'bbr', 'bch', 'bco', 'bdd', 'bea', 'bef', 'bel', 'ben', 'beo', 'beu', 'bgs', 'bgt', 'bhg', 'bhl', 'big', 'bjk', 'bjp', 'bjr', 'bjv', 'bjz', 'bkd', 'bki', 'bkq', 'bkx', 'blw', 'blz', 'bmh', 'bmk', 'bmr', 'bmu', 'bnp', 'boa', 'boj', 'bon', 'box', 'bpr', 'bps', 'bqc', 'bqp', 'bre', 'bsj', 'bsn', 'bsp', 'bss', 'buk', 'bus', 'bvd', 'bvr', 'bxh', 
'byr', 'byx', 'bzd', 'bzh', 'bzj', 'caa', 'cab', 'cac', 'caf', 'cak', 'cao', 'cap', 'car', 'cav', 'cax', 'cbc', 'cbi', 'cbk', 'cbr', 'cbs', 'cbt', 'cbu', 'cbv', 'cco', 'ceb', 'cek', 'ces', 'cgc', 'cha', 'chd', 'chf', 'chk', 'chq', 'chz', 'cjo', 'cjv', 'ckb', 'cle', 'clu', 'cme', 'cmn', 'cni', 'cnl', 'cnt', 'cof', 'con', 'cop', 'cot', 'cpa', 'cpb', 'cpc', 'cpu', 'cpy', 'crn', 'crx', 'cso', 'csy', 'cta', 'cth', 'ctp', 'ctu', 'cub', 'cuc', 'cui', 'cuk', 'cut', 'cux', 'cwe', 'cya', 'daa', 'dad', 'dah', 'dan', 'ded', 'deu', 'dgc', 'dgr', 'dgz', 'dhg', 'dif', 'dik', 'dji', 'djk', 'djr', 'dob', 'dop', 'dov', 'dwr', 'dww', 'dwy', 'ebk', 'eko', 'emi', 'emp', 'eng', 'enq', 'epo', 'eri', 'ese', 'esk', 'etr', 'ewe', 'faa', 'fai', 'far', 'ffm', 'for', 'fra', 'fue', 'fuf', 'fuh', 'gah', 'gai', 'gam', 'gaw', 'gdn', 'gdr', 'geb', 'gfk', 'ghs', 'glk', 'gmv', 'gng', 'gnn', 'gnw', 'gof', 'grc', 'gub', 'guh', 'gui', 'guj', 'gul', 'gum', 'gun', 'guo', 'gup', 'gux', 'gvc', 'gvf', 'gvn', 'gvs', 'gwi', 'gym', 'gyr', 'hat', 'hau', 'haw', 'hbo', 'hch', 'heb', 'heg', 'hin', 'hix', 'hla', 'hlt', 'hmo', 'hns', 'hop', 'hot', 'hrv', 'hto', 'hub', 'hui', 'hun', 'hus', 'huu', 'huv', 'hvn', 'ian', 'ign', 'ikk', 'ikw', 'ilo', 'imo', 'inb', 'ind', 'ino', 'iou', 'ipi', 'isn', 'ita', 'iws', 'ixl', 'jac', 'jae', 'jao', 'jic', 'jid', 'jiv', 'jni', 'jpn', 'jvn', 'kan', 'kaq', 'kbc', 'kbh', 'kbm', 'kbq', 'kdc', 'kde', 'kdl', 'kek', 'ken', 'kew', 'kgf', 'kgk', 'kgp', 'khs', 'khz', 'kik', 'kiw', 'kiz', 'kje', 'kjs', 'kkc', 'kkl', 'klt', 'klv', 'kmg', 'kmh', 'kmk', 'kmo', 'kms', 'kmu', 'kne', 'knf', 'knj', 'knv', 'kos', 'kpf', 'kpg', 'kpj', 'kpr', 'kpw', 'kpx', 'kqa', 'kqc', 'kqf', 'kql', 'kqw', 'ksd', 'ksj', 'ksr', 'ktm', 'kto', 'kud', 'kue', 'kup', 'kvg', 'kvn', 'kwd', 'kwf', 'kwi', 'kwj', 'kyc', 'kyf', 'kyg', 'kyq', 'kyz', 'kze', 'lac', 'lat', 'lbb', 'lbk', 'lcm', 'leu', 'lex', 'lgl', 'lid', 'lif', 'lin', 'lit', 'llg', 'lug', 'luo', 'lww', 'maa', 'maj', 'mal', 'mam', 'maq', 'mar', 'mau', 'mav', 'maz', 
'mbb', 'mbc', 'mbh', 'mbj', 'mbl', 'mbs', 'mbt', 'mca', 'mcb', 'mcd', 'mcf', 'mco', 'mcp', 'mcq', 'mcr', 'mdy', 'med', 'mee', 'mek', 'meq', 'met', 'meu', 'mgc', 'mgh', 'mgw', 'mhl', 'mib', 'mic', 'mie', 'mig', 'mih', 'mil', 'mio', 'mir', 'mit', 'miz', 'mjc', 'mkj', 'mkl', 'mkn', 'mks', 'mle', 'mlh', 'mlp', 'mmo', 'mmx', 'mna', 'mop', 'mox', 'mph', 'mpj', 'mpm', 'mpp', 'mps', 'mpt', 'mpx', 'mqb', 'mqj', 'msb', 'msc', 'msk', 'msm', 'msy', 'mti', 'mto', 'mux', 'muy', 'mva', 'mvn', 'mwc', 'mwe', 'mwf', 'mwp', 'mxb', 'mxp', 'mxq', 'mxt', 'mya', 'myk', 'myu', 'myw', 'myy', 'mzz', 'nab', 'naf', 'nak', 'nas', 'nbq', 'nca', 'nch', 'ncj', 'ncl', 'ncu', 'ndg', 'ndj', 'nfa', 'ngp', 'ngu', 'nhe', 'nhg', 'nhi', 'nho', 'nhr', 'nhu', 'nhw', 'nhy', 'nif', 'nii', 'nin', 'nko', 'nld', 'nlg', 'nna', 'nnq', 'noa', 'nop', 'not', 'nou', 'npi', 'npl', 'nsn', 'nss', 'ntj', 'ntp', 'ntu', 'nuy', 'nvm', 'nwi', 'nya', 'nys', 'nyu', 'obo', 'okv', 'omw', 'ong', 'ons', 'ood', 'opm', 'ory', 'ote', 'otm', 'otn', 'otq', 'ots', 'pab', 'pad', 'pah', 'pan', 'pao', 'pes', 'pib', 'pio', 'pir', 'piu', 'pjt', 'pls', 'plu', 'pma', 'poe', 'poh', 'poi', 'pol', 'pon', 'por', 'poy', 'ppo', 'prf', 'pri', 'ptp', 'ptu', 'pwg', 'qub', 'quc', 'quf', 'quh', 'qul', 'qup', 'qvc', 'qve', 'qvh', 'qvm', 'qvn', 'qvs', 'qvw', 'qvz', 'qwh', 'qxh', 'qxn', 'qxo', 'rai', 'reg', 'rgu', 'rkb', 'rmc', 'rmy', 'ron', 'roo', 'rop', 'row', 'rro', 'ruf', 'rug', 'rus', 'rwo', 'sab', 'san', 'sbe', 'sbk', 'sbs', 'seh', 'sey', 'sgb', 'sgz', 'shj', 'shp', 'sim', 'sja', 'sll', 'smk', 'snc', 'snn', 'snp', 'snx', 'sny', 'som', 'soq', 'soy', 'spa', 'spl', 'spm', 'spp', 'sps', 'spy', 'sri', 'srm', 'srn', 'srp', 'srq', 'ssd', 'ssg', 'ssx', 'stp', 'sua', 'sue', 'sus', 'suz', 'swe', 'swh', 'swp', 'sxb', 'tac', 'taj', 'tam', 'tav', 'taw', 'tbc', 'tbf', 'tbg', 'tbo', 'tbz', 'tca', 'tcs', 'tcz', 'tdt', 'tee', 'tel', 'ter', 'tet', 'tew', 'tfr', 'tgk', 'tgl', 'tgo', 'tgp', 'tha', 'tif', 'tim', 'tiw', 'tiy', 'tke', 'tku', 'tlf', 'tmd', 'tna', 'tnc', 
'tnk', 'tnn', 'tnp', 'toc', 'tod', 'tof', 'toj', 'ton', 'too', 'top', 'tos', 'tpa', 'tpi', 'tpt', 'tpz', 'trc', 'tsw', 'ttc', 'tte', 'tuc', 'tue', 'tuf', 'tuo', 'tur', 'tvk', 'twi', 'txq', 'txu', 'tzj', 'tzo', 'ubr', 'ubu', 'udu', 'uig', 'ukr', 'uli', 'ulk', 'upv', 'ura', 'urb', 'urd', 'uri', 'urt', 'urw', 'usa', 'usp', 'uvh', 'uvl', 'vid', 'vie', 'viv', 'vmy', 'waj', 'wal', 'wap', 'wat', 'wbi', 'wbp', 'wed', 'wer', 'wim', 'wiu', 'wiv', 'wmt', 'wmw', 'wnc', 'wnu', 'wol', 'wos', 'wrk', 'wro', 'wrs', 'wsk', 'wuv', 'xav', 'xbi', 'xed', 'xla', 'xnn', 'xon', 'xsi', 'xtd', 'xtm', 'yaa', 'yad', 'yal', 'yap', 'yaq', 'yby', 'ycn', 'yka', 'yle', 'yml', 'yon', 'yor', 'yrb', 'yre', 'yss', 'yuj', 'yut', 'yuw', 'yva', 'zaa', 'zab', 'zac', 'zad', 'zai', 'zaj', 'zam', 'zao', 'zap', 'zar', 'zas', 'zat', 'zav', 'zaw', 'zca', 'zga', 'zia', 'ziw', 'zlm', 'zos', 'zpc', 'zpl', 'zpm', 'zpo', 'zpq', 'zpu', 'zpv', 'zpz', 'zsr', 'ztq', 'zty', 'zyp'] | BitextMining | s2s | [Religious, Written] | None | None | | [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | | [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None | +| [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [BirdsnapZeroShot](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | 
[BlurbsClusteringP2P.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | p2p | [Fiction, Written] | None | None | | [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None | -| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Web, Social, Fiction, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | +| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | | [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. 
Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | -| [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction] | None | None | +| [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None | -| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | +| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None | -<<<<<<< HEAD -| [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | {'test': 1000} | {'test': 75.24} | -| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Blog, Written] | {'test': 1882} | {'test': {'average_text_length': 91.20563230605738, 'average_label_per_text': 0.620616365568544, 'num_samples': 1882, 'unique_labels': 6, 'labels': {'null': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}} | -| [CLSClusteringP2P.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {} | -| [CLSClusteringS2S.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | s2s | [Academic, Written] | {'test': 2048} | {} | -| [CMedQAv1-reranking](https://github.com/zhangsheng93/cMedQA) (Zhang 
et al., 2017) | ['cmn'] | Reranking | s2s | [Medical, Written] | {'test': 2000} | {'test': 165} | -| [CMedQAv2-reranking](https://github.com/zhangsheng93/cMedQA2) (S. Zhang, 2018) | ['cmn'] | Reranking | s2s | | None | None | -| [COIRCodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'python': {'average_document_length': 466.546, 'average_query_length': 862.842, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 186.018, 'average_query_length': 1415.632, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 125.213, 'average_query_length': 563.729, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 313.818, 'average_query_length': 577.634, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 420.287, 'average_query_length': 690.36, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 162.119, 'average_query_length': 712.129, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}}} | -| [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | {'test': 1} | {'test': 3591} | -| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 593.701974084703, 'average_query_length': 51.76680972818312, 'num_documents': 22998, 'num_queries': 699, 
'average_relevant_docs_per_query': 2.4263233190271816}} | -| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 482.4710971880361, 'average_query_length': 48.32993630573248, 'num_documents': 40221, 'num_queries': 1570, 'average_relevant_docs_per_query': 2.3980891719745223}} | -| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 488.74152888457206, 'average_query_length': 48.772413793103446, 'num_documents': 45301, 'num_queries': 1595, 'average_relevant_docs_per_query': 1.418808777429467}} | -| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1012.167813587693, 'average_query_length': 52.2, 'num_documents': 37637, 'num_queries': 885, 'average_relevant_docs_per_query': 1.2587570621468926}} | -| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1153.4967375037413, 'average_query_length': 48.90547263681592, 'num_documents': 16705, 'num_queries': 804, 'average_relevant_docs_per_query': 1.6890547263681592}} | -| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 818.6476145735463, 'average_query_length': 53.36477382098171, 'num_documents': 38316, 'num_queries': 1039, 'average_relevant_docs_per_query': 1.8604427333974976}} | -| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | 
{'test': {'average_document_length': 1055.7033814022875, 'average_query_length': 55.1837899543379, 'num_documents': 32176, 'num_queries': 876, 'average_relevant_docs_per_query': 1.9121004566210045}} | -| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1055.1668598736662, 'average_query_length': 56.31748466257669, 'num_documents': 42269, 'num_queries': 652, 'average_relevant_docs_per_query': 1.4003067484662577}} | -| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1297.09043177285, 'average_query_length': 46.935306262904334, 'num_documents': 68184, 'num_queries': 2906, 'average_relevant_docs_per_query': 1.7735719201651754}} | -| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1004.8120383267908, 'average_query_length': 50.32369402985075, 'num_documents': 47382, 'num_queries': 1072, 'average_relevant_docs_per_query': 1.5792910447761195}} | -| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 707.3635736857225, 'average_query_length': 51.93478260869565, 'num_documents': 17405, 'num_queries': 506, 'average_relevant_docs_per_query': 2.7569169960474307}} | -| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1122.7690155333814, 'average_query_length': 48.7264325323475, 'num_documents': 48605, 'num_queries': 541, 'average_relevant_docs_per_query': 1.3752310536044363}} | -| 
[CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal ล tefรกnik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 386.5} | -| [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal ล tefรกnik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 366.2} | -| [CTKFactsNLI](https://arxiv.org/abs/2201.11115) (Ullrich et al., 2023) | ['ces'] | PairClassification | s2s | [News, Written] | {'test': 375, 'validation': 305} | {'test': 225.62, 'validation': 219.32} | -| [CUADAffiliateLicenseLicenseeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 198} | {'test': 484.11} | -| [CUADAffiliateLicenseLicensorLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 88} | {'test': 633.4} | -| [CUADAntiAssignmentLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1172} | {'test': 340.81} | -| [CUADAuditRightsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1216} | {'test': 337.14} | -| [CUADCapOnLiabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1246} | {'test': 375.74} | -| [CUADChangeOfControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 416} | {'test': 391.96} | -| [CUADCompetitiveRestrictionExceptionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | 
Classification | s2s | [Legal, Written] | {'test': 220} | {'test': 433.04} | -| [CUADCovenantNotToSueLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 308} | {'test': 402.97} | -| [CUADEffectiveDateLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 236} | {'test': 277.62} | -| [CUADExclusivityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 762} | {'test': 369.17} | -| [CUADExpirationDateLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 876} | {'test': 309.27} | -| [CUADGoverningLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 876} | {'test': 289.87} | -| [CUADIPOwnershipAssignmentLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 576} | {'test': 414.0} | -| [CUADInsuranceLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1030} | {'test': 365.54} | -| [CUADIrrevocableOrPerpetualLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 280} | {'test': 473.4} | -| [CUADJointIPOwnershipLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 192} | {'test': 374.17} | -| 
[CUADLicenseGrantLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1396} | {'test': 409.89} | -| [CUADLiquidatedDamagesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 220} | {'test': 351.76} | -| [CUADMinimumCommitmentLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 772} | {'test': 364.16} | -| [CUADMostFavoredNationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 64} | {'test': 418.75} | -| [CUADNoSolicitOfCustomersLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 84} | {'test': 392.89} | -| [CUADNoSolicitOfEmployeesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 142} | {'test': 417.94} | -| [CUADNonCompeteLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 442} | {'test': 383.2} | -| [CUADNonDisparagementLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 100} | {'test': 403.08} | -| [CUADNonTransferableLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 542} | {'test': 399.16} | -| [CUADNoticePeriodToTerminateRenewalLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | 
['eng'] | Classification | s2s | [Legal, Written] | {'test': 222} | {'test': 354.85} | -| [CUADPostTerminationServicesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 808} | {'test': 422.53} | -| [CUADPriceRestrictionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 46} | {'test': 324.71} | -| [CUADRenewalTermLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 386} | {'test': 340.87} | -| [CUADRevenueProfitSharingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 774} | {'test': 371.55} | -| [CUADRofrRofoRofnLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 690} | {'test': 395.46} | -| [CUADSourceCodeEscrowLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 118} | {'test': 399.18} | -| [CUADTerminationForConvenienceLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 430} | {'test': 326.3} | -| [CUADThirdPartyBeneficiaryLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 68} | {'test': 261.04} | -| [CUADUncappedLiabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 294} | {'test': 441.04} | -| 
[CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 48} | {'test': 368.08} | -| [CUADVolumeRestrictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 322} | {'test': 306.27} | -| [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 320} | {'test': 352.27} | -| [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 244} | {'test': 622.6} | -| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, Government, Written] | {'validation': 2000, 'test': 2000} | {'validation': 202.61, 'test': 200.49} | -| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 538.241873443325, 'average_query_length': 123.39934853420195, 'num_documents': 5416593, 'num_queries': 1535, 'average_relevant_docs_per_query': 3.0495114006514656}} | -| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 1245.4236333727013, 'average_query_length': 121.879, 'num_documents': 47416, 'num_queries': 1000, 'average_relevant_docs_per_query': 3.048}} | -| [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 307.7710222897771, 'average_query_length': 48.470367591897976, 
'num_documents': 100001, 'num_queries': 3999, 'average_relevant_docs_per_query': 1.86271567891973}} | -| [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | -| [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 13000} | {'train': {'python': {'average_document_length': 597.592, 'average_query_length': 69.519, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 582.554, 'average_query_length': 56.88, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'typescript': {'average_document_length': 580.877, 'average_query_length': 60.092, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 548.498, 'average_query_length': 70.797, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 518.895, 'average_query_length': 66.9, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 620.332, 'average_query_length': 62.984, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 545.452, 'average_query_length': 61.927, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'c': {'average_document_length': 475.868, 'average_query_length': 97.588, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'c++': {'average_document_length': 544.446, 'average_query_length': 114.48, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 
'rust': {'average_document_length': 609.548, 'average_query_length': 67.503, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'swift': {'average_document_length': 574.62, 'average_query_length': 57.279, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'scala': {'average_document_length': 495.485, 'average_query_length': 64.833, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'shell': {'average_document_length': 486.519, 'average_query_length': 72.059, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'average_document_length': 1467.879728243677, 'average_query_length': 4425.522256533855, 'num_documents': 66383, 'num_queries': 13277, 'average_relevant_docs_per_query': 1.0}} | -| [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'average_document_length': 1521.3317148588733, 'average_query_length': 724.2441704465598, 'num_documents': 156526, 'num_queries': 31306, 'average_relevant_docs_per_query': 1.0}} | -| [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'python': {'average_document_length': 388.31577184555965, 'average_query_length': 551.7934039415471, 'num_documents': 280652, 'num_queries': 14918, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 276.0730050152605, 'average_query_length': 443.70707991491946, 'num_documents': 65201, 'num_queries': 3291, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 185.0307932251621, 'average_query_length': 
233.76803742920464, 'num_documents': 182735, 'num_queries': 8122, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 214.86204146730464, 'average_query_length': 266.8731165741475, 'num_documents': 27588, 'num_queries': 1261, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 281.96280259139183, 'average_query_length': 342.5341853035144, 'num_documents': 181061, 'num_queries': 10955, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 268.9752569556027, 'average_query_length': 336.62194947909234, 'num_documents': 268237, 'num_queries': 14014, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'python': {'average_document_length': 862.842, 'average_query_length': 466.546, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 1415.632, 'average_query_length': 186.018, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 563.729, 'average_query_length': 125.213, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 577.634, 'average_query_length': 313.818, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 420.287, 'average_query_length': 690.36, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 712.129, 'average_query_length': 162.119, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | 
p2p | [Programming, Written] | | {'test': {'average_document_length': 1528.9156746031747, 'average_query_length': 1012.1131221719457, 'num_documents': 1008, 'num_queries': 221, 'average_relevant_docs_per_query': 1.0}} | -| [CodeTransOceanDL](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['python'] | Retrieval | p2p | [Programming, Written] | | {'test': {'average_document_length': 1479.0735294117646, 'average_query_length': 1867.6222222222223, 'num_documents': 816, 'num_queries': 180, 'average_relevant_docs_per_query': 1.0}} | -| [ContractNLIConfidentialityOfAgreementLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 82} | {'test': 473.17} | -| [ContractNLIExplicitIdentificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 109} | {'test': 506.12} | -| [ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 139} | {'test': 525.75} | -| [ContractNLILimitedUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 208} | {'test': 407.51} | -| [ContractNLINoLicensingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 162} | {'test': 419.42} | -| [ContractNLINoticeOnCompelledDisclosureLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 142} | {'test': 503.45} | -| [ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) 
(Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 178} | {'test': 427.4} | -| [ContractNLIPermissibleCopyLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 87} | {'test': 386.84} | -| [ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 136} | {'test': 396.4} | -| [ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 111} | {'test': 529.09} | -| [ContractNLIReturnOfConfidentialInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 66} | {'test': 478.29} | -| [ContractNLISharingWithEmployeesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 170} | {'test': 548.63} | -| [ContractNLISharingWithThirdPartiesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 180} | {'test': 517.29} | -| [ContractNLISurvivalOfObligationsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 157} | {'test': 417.64} | -| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 39838} | {'test': {'num_docs': 19899, 'num_queries': 20, 'average_document_length': 2233.0329664807277, 'average_query_length': 109.75, 'average_instruction_length': 
295.55, 'average_changed_instruction_length': 355.2, 'average_relevant_docs_per_query': 32.7, 'average_top_ranked_per_query': 1000.0}} | -| [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 490} | {'test': 6039.85} | -| [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | | {'test': {'average_document_length': 276.132741215298, 'average_query_length': 36.814, 'num_documents': 20604, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}} | -| [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 332.4152658473415, 'average_query_length': 25.9304531085353, 'num_documents': 100001, 'num_queries': 949, 'average_relevant_docs_per_query': 1.0105374077976819}} | -| [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | {'test': 2946} | {'test': {'deu-fra': {'average_document_length': 147.49857433808555, 'average_query_length': 152.95587236931433, 'num_documents': 7365, 'num_queries': 1473, 'average_relevant_docs_per_query': 1.0}, 'fra-deu': {'average_document_length': 154.21968771215208, 'average_query_length': 145.877800407332, 'num_documents': 7365, 'num_queries': 1473, 'average_relevant_docs_per_query': 1.0}}} | -| [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | {'test': 1786} | {'test': {'deu-fra': {'average_document_length': 177.26270996640537, 'average_query_length': 171.73012318029114, 'num_documents': 4465, 'num_queries': 893, 'average_relevant_docs_per_query': 1.0}, 'fra-deu': {'average_document_length': 174.45061590145576, 'average_query_length': 176.99216125419932, 'num_documents': 4465, 
'num_queries': 893, 'average_relevant_docs_per_query': 1.0}}} | -| [CyrillicTurkicLangClassification](https://huggingface.co/datasets/tatiana-merz/cyrillic_turkic_langs) (Goldhahn et al., 2012) | ['bak', 'chv', 'kaz', 'kir', 'krc', 'rus', 'sah', 'tat', 'tyv'] | Classification | s2s | [Web, Written] | {'test': 2048} | {'test': 92.22} | -| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 153.26} | -| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | {'test': 1000} | {'test': 59.89} | -| [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | {'validation': 500, 'test': 2000} | {'validation': 108.2, 'test': 108.3} | -| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | {'test': {'average_document_length': 1122.7690155333814, 'average_query_length': 48.7264325323475, 'num_documents': 48605, 'num_queries': 541, 'average_relevant_docs_per_query': 1.3752310536044363}} | -| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | {'test': {'average_document_length': 311.7007956561823, 'average_query_length': 35.45, 'num_documents': 4635922, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | {'test': 400} | {'test': {'average_document_length': 363.468546000768, 'average_query_length': 35.45, 'num_documents': 88542, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | 
['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | {'test': 400} | {'test': {'average_document_length': 338.58561119129564, 'average_query_length': 34.085, 'num_documents': 90070, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | {'test': 70000} | {'test': 281.4} | -| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | {'test': 329} | {'test': 104.0} | -| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | {'test': 444} | {'test': 243.8} | -| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | {'train': 8897} | {'train': {'average_document_length': 312.1117274167987, 'average_query_length': 50.26957476855484, 'num_documents': 2524, 'num_queries': 6373, 'average_relevant_docs_per_query': 0.48721167425074535}} | -| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | {'train': 9010} | {'train': 69.9} | -| [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1337} | {'test': 253.72} | -| [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (Gonzรกlez et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | {} | {} | -| [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 103.21} | -| 
[Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 0} | -| [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 135.46} | -| [Diversity4LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 144.52} | -| [Diversity5LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 174.77} | -| [Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 301.01} | -| [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 331.3219967800322, 'average_query_length': 9.289, 'num_documents': 100001, 'num_queries': 2000, 'average_relevant_docs_per_query': 4.9195}} | -| [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | {'test': 2224} | {'test': 1443.0} | -| [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. 
Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | | | -| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 32.98041664189015, 'average_query_length': 6.798, 'num_documents': 100902, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | {'test': 2048} | {'test': 78.73} | -| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | {'validation': 2000, 'test': 2000} | {'validation': 95.3, 'test': 95.6} | -| [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 603} | {'test': {'average_document_length': 785.595041322314, 'average_query_length': 55.32006633499171, 'num_documents': 121, 'num_queries': 603, 'average_relevant_docs_per_query': 1.0}} | -| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | {'train': 3270, 'test': 818} | {'train': 226.70642201834863, 'test': 231.5085574572127} | -| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 538.2340070317589, 'average_query_length': 47.56034058828886, 'num_documents': 5416568, 'num_queries': 109810, 'average_relevant_docs_per_query': 1.2757034878426372}, 'dev': {'average_document_length': 538.2340070317589, 'average_query_length': 47.326282628262824, 'num_documents': 5416568, 'num_queries': 6666, 'average_relevant_docs_per_query': 1.211971197119712}, 'test': {'average_document_length': 538.2340070317589, 'average_query_length': 49.60546054605461, 
'num_documents': 5416568, 'num_queries': 6666, 'average_relevant_docs_per_query': 1.1906690669066906}} | -| [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 695.4370242764114, 'average_query_length': 49.62, 'num_documents': 163698, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.171}} | -| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 400, 'validation': 100} | {'test': {'average_document_length': 896.3308550185874, 'average_query_length': 58.52, 'num_documents': 269, 'num_queries': 400, 'average_relevant_docs_per_query': 1.0}, 'validation': {'average_document_length': 895.1340206185567, 'average_query_length': 54.13, 'num_documents': 97, 'num_queries': 100, 'average_relevant_docs_per_query': 1.0}} | -| [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 2042} | {'test': {'average_document_length': 140.61062447018932, 'average_query_length': 4.926542605288932, 'num_documents': 3539, 'num_queries': 2042, 'average_relevant_docs_per_query': 1.0}} | -| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | {'test': 1524} | {'test': 40.3} | -| [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | {'train': 729} | {'train': 43.6} | -| [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | {'test': 1029} | {'test': 125.84} | -| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | {'test': 1992} | {'test': {'average_document_length': 1174.7986463620982, 'average_query_length': 
72.33182730923694, 'num_documents': 2364, 'num_queries': 1992, 'average_relevant_docs_per_query': 1.0}} | -| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 795.2371699226205, 'average_query_length': 70.00771604938272, 'num_documents': 57638, 'num_queries': 648, 'average_relevant_docs_per_query': 2.632716049382716}} | -| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 767.2108157812554, 'average_query_length': 61.49763636363636, 'num_documents': 57638, 'num_queries': 5500, 'average_relevant_docs_per_query': 2.5756363636363635}, 'dev': {'average_document_length': 767.2108157812554, 'average_query_length': 62.756, 'num_documents': 57638, 'num_queries': 500, 'average_relevant_docs_per_query': 2.476}, 'test': {'average_document_length': 767.2108157812554, 'average_query_length': 62.7037037037037, 'num_documents': 57638, 'num_queries': 648, 'average_relevant_docs_per_query': 2.632716049382716}} | -| [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | {'validation': 2048, 'test': 2048} | {'validation': 88.1, 'test': 87.4} | -| [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | {'validation': 2250, 'test': 2250} | {'validation': 143.8, 'test': 145.1} | -| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | {'test': 1000, 'validation': 1000} | {'test': 59.0, 'validation': 58.8} | -| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | {'train': 2048, 'test': 2048} | {'train': 
432.63, 'test': 401.03} | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [News, Written] | {'train': 4840} | {'train': 121.96} | -| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | {'dev': 997, 'devtest': 1012} | {} | -| [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 311.5} | -| 
[FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | {'test': 2300} | {'test': 188.75} | -| [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | {'test': 2120} | {'test': 89.86} | -| [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | {'test': 2177} | {'test': 136.61} | -| [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 367} | {'test': 551.07} | -| [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | {'test': 1} | {'test': 3591} | -| [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | {'test': 2048} | {'test': 409.0} | -| [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | {'test': 2000} | {'test': 384.5} | -| [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | {'test': 2566} | {'test': {'average_document_length': 511.24668745128605, 'average_query_length': 61.69551656920078, 'num_documents': 2566, 'num_queries': 2565, 'average_relevant_docs_per_query': 1.0003898635477584}} | -| [GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 15483.237726805888, 'average_query_length': 1027.3495690356156, 'num_documents': 131445, 'num_queries': 12298, 'average_relevant_docs_per_query': 1.1704342169458448}} | -| 
[GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | {'test': {'average_document_length': 19706.823653325308, 'average_query_length': 1031.0680889324833, 'num_documents': 9969, 'num_queries': 12234, 'average_relevant_docs_per_query': 1.1705084191597188}} | -| [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1288.3410987482614, 'average_query_length': 64.38439024390244, 'num_documents': 2876, 'num_queries': 1025, 'average_relevant_docs_per_query': 1.0}} | -| [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | [Government, Written] | {'test': 357} | {'test': {'average_document_length': 1246.4571428571428, 'average_query_length': 68.17977528089888, 'num_documents': 105, 'num_queries': 356, 'average_relevant_docs_per_query': 1.0}} | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Social, Government, Written] | {'test': 357} | {'test': 302.48} | -| [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1941.090717299578, 'average_query_length': 56.74773139745916, 'num_documents': 474, 'num_queries': 2204, 'average_relevant_docs_per_query': 1.0}} | -| [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | -| [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval | s2p | [Academic, Written] | {'default': 407} | {'default': {'average_document_length': 1074.894348894349, 'average_query_length': 77.06142506142506, 'num_documents': 407, 'num_queries': 407, 'average_relevant_docs_per_query': 1.0}} | -| 
[GreekLegalCodeClassification](https://arxiv.org/abs/2109.15298) | ['ell'] | Classification | s2s | [Legal, Written] | {'validation': 2048, 'test': 2048} | {'validation': 4046.8, 'test': 4200.8} | -| [GujaratiNewsClassification](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | {'train': 5269, 'test': 1318} | {'train': 61.95, 'test': 61.91} | -| [HALClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) (Mathieu Ciancone, 2024) | ['fra'] | Clustering | s2s | [Academic, Written] | {'test': 2048} | {'test': 86.6} | -| [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'train': 1922} | {'dev': {'average_document_length': 228.36693548387098, 'average_query_length': 40.064516129032256, 'num_documents': 496, 'num_queries': 496, 'average_relevant_docs_per_query': 1.0}} | -| [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) | ['por'] | Classification | s2s | [Social, Written] | {'train': 2048} | {'train': 101.02} | -| [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) | ['rus'] | Classification | s2s | [News, Written] | {'test': 2048} | {'test': 61.6} | -| [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) | ['heb'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 113.57} | -| [HellaSwag](https://rowanzellers.com/hellaswag/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 10042} | {'test': {'average_document_length': 137.36519014671472, 'average_query_length': 224.53654650468033, 'num_documents': 199162, 'num_queries': 10042, 'average_relevant_docs_per_query': 1.0}} | -| [HinDialectClassification](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4839) (Bafna et al., 2022) | ['anp', 'awa', 'ben', 'bgc', 'bhb', 'bhd', 'bho', 'bjj', 'bns', 'bra', 'gbm', 'guj', 'hne', 'kfg', 'kfy', 'mag', 
'mar', 'mup', 'noe', 'pan', 'raj'] | Classification | s2s | [Social, Spoken, Written] | {'test': 1152} | {'test': 583.82} | -| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | {'train': 2048} | {'train': 79.23828125} | -| [HotelReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3) (Elnagar et al., 2018) | ['ara'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 137.2} | -| [HotpotQA](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | {'train': {'average_document_length': 287.9079517072212, 'average_query_length': 105.54965882352941, 'num_documents': 5233329, 'num_queries': 85000, 'average_relevant_docs_per_query': 2.0}, 'dev': {'average_document_length': 287.9079517072212, 'average_query_length': 105.35634294106848, 'num_documents': 5233329, 'num_queries': 5447, 'average_relevant_docs_per_query': 2.0}, 'test': {'average_document_length': 287.9079517072212, 'average_query_length': 92.17096556380824, 'num_documents': 5233329, 'num_queries': 7405, 'average_relevant_docs_per_query': 2.0}} | -| [HotpotQA-PL](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | {'test': {'average_document_length': 292.26835882093405, 'average_query_length': 94.64064821066847, 'num_documents': 5233329, 'num_queries': 7405, 'average_relevant_docs_per_query': 2.0}} | -| [HotpotQA-PLHardNegatives](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | {'test': 1000} | {'test': {'average_document_length': 438.3888210025661, 'average_query_length': 95.161, 'num_documents': 212774, 'num_queries': 1000, 'average_relevant_docs_per_query': 2.0}} | -| [HotpotQAHardNegatives](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | {'test': 1000} | {'test': {'average_document_length': 
373.558822095461, 'average_query_length': 92.584, 'num_documents': 225621, 'num_queries': 1000, 'average_relevant_docs_per_query': 2.0}} | -| [HunSum2AbstractiveRetrieval](https://arxiv.org/abs/2404.03555) (Botond Barta, 2024) | ['hun'] | Retrieval | s2p | [News, Written] | {'test': 1998} | {'test': {'average_document_length': 2511.0315315315315, 'average_query_length': 201.2112112112112, 'num_documents': 1998, 'num_queries': 1998, 'average_relevant_docs_per_query': 1.0}} | -======= | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None | -| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Blog, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | +| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 
'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | +| [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | +| [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | None | None | +| [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | None | None | +| [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | None | None | +| [CIFAR100ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | +| [CIFAR10Clustering](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None | +| [CIFAR10ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | +| 
[CIRRIT2IRetrieval](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Image_Retrieval_on_Real-Life_Images_With_Pre-Trained_Vision-and-Language_Models_ICCV_2021_paper.html) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None | +| [CLEVRCountZeroShot](https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html) (Johnson et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Constructed] | None | None | +| [CLEVRZeroShot](https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html) (Johnson et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Constructed] | None | None | | [CLSClusteringP2P.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | p2p | [Academic, Written] | None | None | | [CLSClusteringS2S.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | s2s | [Academic, Written] | None | None | | [CMedQAv1-reranking](https://github.com/zhangsheng93/cMedQA) (Zhang et al., 2017) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None | | [CMedQAv2-reranking](https://github.com/zhangsheng93/cMedQA2) (S. 
Zhang, 2018) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None | | [COIRCodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1056326} | {'test': {'number_of_characters': 36843313, 'num_samples': 1056326, 'num_queries': 52561, 'num_documents': 1003765, 'min_document_length': 54, 'average_document_length': 34.71, 'max_document_length': 334374, 'unique_documents': 1003765, 'min_query_length': 2, 'average_query_length': 38.19, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 14574651, 'num_samples': 295228, 'num_queries': 14918, 'num_documents': 280310, 'min_document_length': 95, 'average_document_length': 49.99, 'max_document_length': 14008, 'unique_documents': 280310, 'min_query_length': 2, 'average_query_length': 37.58, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 2587540, 'num_samples': 68145, 'num_queries': 3291, 'num_documents': 64854, 'min_document_length': 87, 'average_document_length': 37.9, 'max_document_length': 334374, 'unique_documents': 64854, 'min_query_length': 2, 'average_query_length': 39.41, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 3641108, 'num_samples': 190562, 'num_queries': 8122, 'num_documents': 182440, 'min_document_length': 54, 'average_document_length': 17.96, 'max_document_length': 5280, 'unique_documents': 182440, 
'min_query_length': 2, 'average_query_length': 44.92, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 629446, 'num_samples': 28831, 'num_queries': 1261, 'num_documents': 27570, 'min_document_length': 83, 'average_document_length': 20.83, 'max_document_length': 3992, 'unique_documents': 27570, 'min_query_length': 2, 'average_query_length': 43.73, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 6791137, 'num_samples': 191821, 'num_queries': 10955, 'num_documents': 180866, 'min_document_length': 77, 'average_document_length': 35.55, 'max_document_length': 7615, 'unique_documents': 180866, 'min_query_length': 2, 'average_query_length': 33.02, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 8619431, 'num_samples': 281739, 'num_queries': 14014, 'num_documents': 267725, 'min_document_length': 94, 'average_document_length': 30.2, 'max_document_length': 4904, 'unique_documents': 267725, 'min_query_length': 2, 'average_query_length': 38.21, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | -| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) 
| ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | -| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Programming, Web, 
Written] | None | None | +| [CQADupstackAndroidRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-android-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written] | None | None | +| [CQADupstackEnglishRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-english-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [CQADupstackGamingRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackGisRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | +| [CQADupstackMathematicaRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | +| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, 
Programming, Written] | None | None | +| [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | +| CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | +| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | +| [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [CQADupstackTexRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | +| [CQADupstackUnixRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [CQADupstackWebmastersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | +| 
[CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | | [CTKFactsNLI](https://arxiv.org/abs/2201.11115) (Ullrich et al., 2023) | ['ces'] | PairClassification | s2s | [News, Written] | None | None | @@ -319,16 +155,30 @@ The following tables give you an overview of the tasks in MTEB. | [CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUADVolumeRestrictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [CUB200I2IRetrieval](https://www.florian-schroff.de/publications/CUB-200.pdf) (Welinder et al., 2010) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | +| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | +| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | 
Any2TextMutipleChoice | it2t | [Academic] | None | None | +| [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | +| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | +| [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, Government, Written] | None | None | -| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | -| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | +| [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | +| [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | +| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| 
[ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 
'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 
'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 
'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 79660} | {'test': {'number_of_characters': 156266302, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'min_document_length': 127, 'average_document_length': 885.13, 'max_document_length': 32432, 'unique_documents': 66383, 'min_query_length': 2, 'average_query_length': 7344.18, 'max_query_length': 9403, 'unique_queries': 13277, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 
13277}} | | [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 260957682, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'min_document_length': 26, 'average_document_length': 144.85, 'max_document_length': 13851, 'unique_documents': 156526, 'min_query_length': 1, 'average_query_length': 7611.46, 'max_query_length': 11354, 'unique_queries': 31306, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 31306}} | +| [CodeRAGLibraryDocumentationSolutions](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGOnlineTutorials](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGProgrammingSolutions](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGStackoverflowPosts](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | | [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1058035} | {'test': {'number_of_characters': 22407915, 'num_samples': 1058035, 'num_queries': 52561, 'num_documents': 1005474, 'min_document_length': 23, 'average_document_length': 20.29, 'max_document_length': 214210, 'unique_documents': 1005474, 'min_query_length': 2, 'average_query_length': 38.26, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': 
{'number_of_characters': 8792958, 'num_samples': 295570, 'num_queries': 14918, 'num_documents': 280652, 'min_document_length': 38, 'average_document_length': 29.33, 'max_document_length': 8326, 'unique_documents': 280652, 'min_query_length': 2, 'average_query_length': 37.63, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 1590642, 'num_samples': 68492, 'num_queries': 3291, 'num_documents': 65201, 'min_document_length': 40, 'average_document_length': 22.4, 'max_document_length': 214210, 'unique_documents': 65201, 'min_query_length': 2, 'average_query_length': 39.62, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 2264134, 'num_samples': 190857, 'num_queries': 8122, 'num_documents': 182735, 'min_document_length': 23, 'average_document_length': 10.39, 'max_document_length': 3589, 'unique_documents': 182735, 'min_query_length': 2, 'average_query_length': 45.0, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 391703, 'num_samples': 28849, 'num_queries': 1261, 'num_documents': 27588, 'min_document_length': 36, 'average_document_length': 12.2, 'max_document_length': 2244, 'unique_documents': 27588, 'min_query_length': 2, 'average_query_length': 43.76, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 4114584, 'num_samples': 192016, 'num_queries': 10955, 'num_documents': 181061, 
'min_document_length': 38, 'average_document_length': 20.72, 'max_document_length': 5066, 'unique_documents': 181061, 'min_query_length': 2, 'average_query_length': 33.06, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 5253894, 'num_samples': 282251, 'num_queries': 14014, 'num_documents': 268237, 'min_document_length': 40, 'average_document_length': 17.59, 'max_document_length': 2995, 'unique_documents': 268237, 'min_query_length': 2, 'average_query_length': 38.28, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 1950074, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'min_document_length': 2, 'average_document_length': 324.01, 'max_document_length': 17533, 'unique_documents': 6000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 6000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467546, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 8, 'average_document_length': 466.55, 'max_document_length': 8636, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 187018, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 186.02, 'max_document_length': 7657, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 126213, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 14, 'average_document_length': 125.21, 'max_document_length': 1501, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 314818, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 313.82, 'max_document_length': 17533, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 691360, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 690.36, 'max_document_length': 6473, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 163119, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 
'min_document_length': 5, 'average_document_length': 162.12, 'max_document_length': 1240, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 1744286, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'min_document_length': 8, 'average_document_length': 221.9, 'max_document_length': 4147, 'unique_documents': 1008, 'min_query_length': 8, 'average_query_length': 6880.58, 'max_query_length': 10852, 'unique_queries': 221, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 221}} | @@ -350,6 +200,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'test': 19919} | {'test': {'num_samples': 19919, 'num_docs': 19899, 'num_queries': 20, 'number_of_characters': 44450333, 'min_document_length': 7, 'average_document_length': 2233.03, 'max_document_length': 2959, 'unique_docs': 19143, 'min_query_length': 55, 'average_query_length': 109.75, 'max_query_length': 278, 'unique_queries': 20, 'min_instruction_length': 102, 'average_instruction_length': 295.55, 'max_instruction_length': 811, 'unique_instructions': 20, 'min_changed_instruction_length': 151, 'average_changed_instruction_length': 355.2, 'max_changed_instruction_length': 837, 'unique_changed_instructions': 20, 'min_average_relevant_docs_per_query': 4, 'average_relevant_docs_per_query': 32.7, 'max_average_relevant_docs_per_query': 55, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}} | | [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 21104} | {'test': {'number_of_characters': 5728450, 'num_samples': 21104, 'num_queries': 500, 'num_documents': 20604, 'min_document_length': 18, 'average_document_length': 0.89, 'max_document_length': 83, 'unique_documents': 20604, 'min_query_length': 88, 'average_query_length': 11420.09, 'max_query_length': 6396, 'unique_queries': 500, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 500}} | +| [Country211](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| 
[Country211ZeroShot](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | | [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | | [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | @@ -357,17 +209,23 @@ The following tables give you an overview of the tasks in MTEB. | [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | -| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | -| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | -| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | -| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPedia-Fa](https://huggingface.co/datasets/MCINext/dbpedia-fa) | 
['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | +| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | | [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | | [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | | [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DeepSentiPers](https://github.com/JoyeBright/DeepSentiPers) | ['fas'] | Classification | s2s | [Reviews] | None | None | | [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) 
(González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | None | None | +| [DigikalamagClassification](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Classification | p2p | [Web] | None | None | +| [DigikalamagClustering](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Clustering | p2p | [Web] | None | None | | [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -376,34 +234,57 @@ The following tables give you an overview of the tasks in MTEB. | [Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None | +| [EDIST2ITRetrieval](https://aclanthology.org/2023.emnlp-main.297/) (Liu et al., 2023) | ['eng'] | Any2AnyRetrieval | t2it | [News] | None | None | | [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. 
Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'min_query_length': 1, 'avg_query_length': 19.69, 'max_query_length': 151, 'unique_query': 29269, 'min_positive_length': 1, 'avg_positive_length': 803.92, 'max_positive_length': 8640, 'unique_positive': 217712, 'min_negative_length': 1, 'avg_negative_length': 808.5, 'max_negative_length': 4441, 'unique_negative': 39551, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'min_query_length': 1, 'avg_query_length': 21.44, 'max_query_length': 151, 'unique_query': 21296, 'min_positive_length': 1, 'avg_positive_length': 868.37, 'max_positive_length': 5545, 'unique_positive': 150734, 'min_negative_length': 1, 'avg_negative_length': 864.45, 'max_negative_length': 3779, 'unique_negative': 23073}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'min_query_length': 3, 'avg_query_length': 20.68, 'max_query_length': 59, 'unique_query': 3703, 'min_positive_length': 1, 'avg_positive_length': 980.96, 'max_positive_length': 8640, 'unique_positive': 32921, 'min_negative_length': 1, 'avg_negative_length': 1023.22, 'max_negative_length': 4441, 'unique_negative': 9285}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'min_query_length': 1, 'avg_query_length': 10.15, 'max_query_length': 60, 'unique_query': 4286, 'min_positive_length': 1, 'avg_positive_length': 358.36, 'max_positive_length': 3488, 'unique_positive': 35165, 'min_negative_length': 1, 'avg_negative_length': 388.08, 'max_negative_length': 3940, 'unique_negative': 7289}}}} | | [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | | 
[EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | | [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None | | [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None | -| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [EuroSATZeroShot](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [FER2013ZeroShot](https://arxiv.org/abs/1412.6572) (Ian J. 
Goodfellow, 2015) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [FGVCAircraftZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [FORBI2IRetrieval](https://github.com/pxiangwu/FORB) (Pengxiang Wu, 2023) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | None | None | | [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | None | None | | [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None | -| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | None | None | -| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | None | -| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | 
PairClassification | s2s | | None | None | +| [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None | +| [Fashion200kI2TRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | +| [Fashion200kT2IRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | +| [FashionIQIT2IRetrieval](https://openaccess.thecvf.com/content/CVPR2021/html/Wu_Fashion_IQ_A_New_Dataset_Towards_Retrieving_Images_by_Natural_CVPR_2021_paper.html) (Wu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None | +| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | +| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Financial, Written] | None | None | +| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Financial, Written] | None | None | +| [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | | 
[FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | None | None | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [News, Written] | None | None | -| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | None | None | +| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. 
Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None | +| [Flickr30kI2TRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Web, Written] | None | None | +| [Flickr30kT2IRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None | +| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 
'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | +| [Food101Classification](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ImageClassification | i2i | [Web] | None | None | +| [Food101ZeroShot](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | | [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubeลกiฤ‡, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubeลกiฤ‡, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None | | [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubeลกiฤ‡, 2019) | ['slv'] | Classification | s2s | [Social, Written] | None | None | | [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [GLDv2I2IRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | +| [GLDv2I2TRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | | [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] 
| Speed | s2s | [Fiction, Written] | None | None | +| [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| [GTSRBZeroShot](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None | | [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | None | None | @@ -411,7 +292,7 @@ The following tables give you an overview of the tasks in MTEB. | [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | | [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Mรถller, 2021) | ['deu'] | Retrieval | s2p | | None | None | | [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | [Government, Written] | None | None | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Social, Government, Written] | None | None | +| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | | [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Mรถller, 2021) | ['deu'] | Retrieval | s2p | | None | None | | [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | | [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval 
| s2p | [Academic, Written] | None | None | @@ -419,7 +300,10 @@ The following tables give you an overview of the tasks in MTEB. | [GujaratiNewsClassification](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | None | None | | [HALClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) (Mathieu Ciancone, 2024) | ['fra'] | Clustering | s2s | [Academic, Written] | None | None | | [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [HamshahriClustring](https://github.com/mallahyari/Farsi-datasets) | ['fas'] | Clustering | p2p | [News] | None | None | | [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) | ['por'] | Classification | s2s | [Social, Written] | None | None | +| [HatefulMemesI2TRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | +| [HatefulMemesT2IRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | | [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) | ['rus'] | Classification | s2s | [News, Written] | None | None | | [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | | [HellaSwag](https://rowanzellers.com/hellaswag/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | @@ -427,115 +311,54 @@ The following tables give you an overview of the tasks in MTEB. 
| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | | [HotelReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3) (Elnagar et al., 2018) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [HotpotQA](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [HotpotQA-Fa](https://huggingface.co/datasets/MCINext/hotpotqa-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [HotpotQA-PL](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PLHardNegatives](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQAHardNegatives](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [HunSum2AbstractiveRetrieval](https://arxiv.org/abs/2404.03555) (Botond Barta, 2024) | ['hun'] | Retrieval | s2p | [News, Written] | None | None | ->>>>>>> main | [IFlyTek](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | -| [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Social, Spoken, Fiction, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 
'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 
'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 
'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 
'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 
'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 
'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 
'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | -| [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Web, Legal, Government, News, Religious, Non-fiction, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 
'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 
'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 
149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 
'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 
'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 
'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 
'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': 
{'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 
'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 
'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 
'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 
'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 
'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': 
{'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 
'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 
'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 
'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 
'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 
'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 
'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 
167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 
'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': 
{'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 
'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 
'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 
'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 
'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 
'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 
'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 
'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 
'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': 
{'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 
'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 
'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 
'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 
'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 
'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 
'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': 
{'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 
'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 
'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 
'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 
145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 
'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 
'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 
'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 
'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 
'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 
'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': 
{'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 
'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 
'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 
'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 
'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 
'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 
'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 
'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': 
{'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | -| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Non-fiction, Fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 
108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 
'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 
'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 
'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | +| [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 
'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 
'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 
'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 
'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 
'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 
'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 
'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 
'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 
'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 
'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 
'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 
'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 
'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 
'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 
'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 
'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 
'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 
'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 
'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 
'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 
'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 
'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 
'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 
'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 
'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 
'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 
'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 
'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 
'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 
'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 
'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 
'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 
'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 
'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 
'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 
'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 
'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 
'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 
'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 
'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 
'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 
'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 
'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 
'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 
'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 
'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 
'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 
'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 
'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | +| [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) 
(Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 
'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 
'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 
'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 
'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 
'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 
'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 
'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': 
{'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 
'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 
'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 
'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 
'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 
'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 
'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 
'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 
'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': 
{'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 
'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 
'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 
'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 
'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 
'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 
'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 
321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 
'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': 
{'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 
'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 
'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 
'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 
'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 
'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 
'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 
'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': 
{'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 
'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 
'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 
'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 
'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 
'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 
'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': 
{'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 
'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 
'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 
'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 
167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 
'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 
'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 
'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 
'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 
'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 
'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 
'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': 
{'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 
'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 
'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 
'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 
'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 
'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 
'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | +| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 
'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 
'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 
'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 
'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | +| [ImageCoDeT2IMultiChoice](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et 
al., 2022) | ['eng'] | Any2AnyMultiChoice | it2i | [Web, Written] | None | None | +| [ImageCoDeT2IRetrieval](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None | +| [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2t | [Web] | None | None | +| [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None | +| [Imagenet1k](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| [Imagenet1kZeroShot](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | -| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Web, Social, Written] | None | None | -| [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [News, Non-fiction, Web, Spoken, Government, Written, Spoken] | None | None | -| [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Web, News, Written] | {'validation': 57826, 'test': 58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 
'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': {'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 
125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 
'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 
'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': 
{'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 
997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': {'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 
'max_sentence1_length': 355, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 
'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': {'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 
'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': {'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': 
{'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 
'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 
1012, 'min_sentence1_length': 48, 'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 
'average_sentence1_length': 130.92, 'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 
'max_sentence1_length': 367, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 
1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 
'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 
'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 
1012}, 'eng-hne': {'num_samples': 1012, 'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 
'number_of_characters': 268768, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 
1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 1012}}}} | -| [IndicLangClassification](https://arxiv.org/abs/2305.15814) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Web, Non-fiction, Written] | None | None | +| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | +| [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [Government, News, Non-fiction, Spoken, Spoken, Web, Written] | None | None | +| [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [News, Web, Written] | {'validation': 57826, 'test': 58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 
'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': {'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 
'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 
'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 
'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 
'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': {'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 
'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 'max_sentence1_length': 355, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 
'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': 
{'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': {'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 
'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 
'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 
'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 
'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 'average_sentence1_length': 130.92, 'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 
1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 'max_sentence1_length': 367, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 
'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 
'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 
1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 
'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hne': {'num_samples': 1012, 'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 
1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 'number_of_characters': 268768, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 
'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 1012}}}} | +| [IndicLangClassification](https://arxiv.org/abs/2305.15814) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 
'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [IndicNLPNewsClassification](https://github.com/AI4Bharat/indicnlp_corpus#indicnlp-news-article-classification-dataset) (Anoop Kunchukuttan, 2020) | ['guj', 'kan', 'mal', 'mar', 'ori', 'pan', 'tam', 'tel'] | Classification | s2s | [News, Written] | None | None | | [IndicQARetrieval](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel'] | Retrieval | s2p | [Web, Written] | None | None | | [IndicReviewsClusteringP2P](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Clustering | p2p | [Reviews, Written] | None | None | | [IndicSentimentClassification](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Classification | s2s | [Reviews, Written] | None | None | | [IndonesianIdClickbaitClassification](http://www.sciencedirect.com/science/article/pii/S2352340920311252) | ['ind'] | Classification | s2s | [News, Written] | None | None | | [IndonesianMongabayConservationClassification](https://aclanthology.org/2023.sealp-1.4/) | ['ind'] | Classification | s2s | [Web, Written] | None | None | +| [InfoSeekIT2ITRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None | +| [InfoSeekIT2TRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None | | [InsurancePolicyInterpretationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | 
[InternationalCitizenshipQuestionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [IsiZuluNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None | -| [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Legal, Government, Written] | None | None | +| [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Government, Legal, Written] | None | None | | [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | | [JCrewBlockerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [JDReview](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | -<<<<<<< HEAD -| [JSICK](https://github.com/sbintuitions/JMTEB) (Yanaka et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | {'test': 1986} | {'test': 21.47} | -| [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) | ['jpn'] | STS | s2s | [Web, Written] | {'valudtion': 1457} | {'valudtion': 46.34} | -| [JaGovFaqsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Web, Written] | {'test': 2048} | {'test': {'average_document_length': 210.02601561814512, 'average_query_length': 59.48193359375, 'num_documents': 22794, 'num_queries': 2048, 'average_relevant_docs_per_query': 1.0}} | -| [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | {'validation': 2048} | {'validation': 
{'average_document_length': 155.80922362309224, 'average_query_length': 30.826171875, 'num_documents': 3014, 'num_queries': 2048, 'average_relevant_docs_per_query': 2.0}} | -| [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | | | -| [JavaneseIMDBClassification](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | {'test': 25000} | {'test': 481.83} | -| [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [News, Encyclopaedic, Written] | {'validation': 2000} | {'validation': 35.01} | -| [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [Reviews, News, Spoken, Written, Spoken] | {'validation': 519} | {'validation': 33.178227360308284} | -| [KLUE-TC](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | {'validation': 2048} | {'validation': 27.079609091907326} | -| [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | {'train': 6460} | {'train': 65.88} | -| [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | Classification | p2p | [Reviews, Written] | {'test': 1500} | {'test': 1897.3} | -| Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 319.25953950924225, 'average_query_length': 22.75337837837838, 'num_documents': 9251, 'num_queries': 592, 'average_relevant_docs_per_query': 1.9341216216216217}} | -| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [News, Written] | {'test': 2048} | {'test': 75.28} | -| [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) 
(Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | {'train': 2048, 'test': 471} | {'train': 38.57, 'test': 38.86} | -| [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | {'train': 8192, 'test': 2048} | {'train': 33.67, 'test': 34.67} | -| [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | {'test': 1379} | {'test': 29.279433139534884} | -| [KorSarcasmClassification](https://github.com/SpellOnYou/korean-sarcasm) (Kim et al., 2019) | ['kor'] | Classification | s2s | [Social, Written] | {'train': 2048, 'test': 301} | {'train': 48.45, 'test': 46.77} | -| [KurdishSentimentClassification](https://link.springer.com/article/10.1007/s10579-023-09716-6) (Badawi et al., 2024) | ['kur'] | Classification | s2s | [Web, Written] | {'train': 6000, 'test': 1987} | {'train': 59.38, 'test': 56.11} | -| [LCQMC](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [LEMBNarrativeQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Fiction, Non-fiction, Written] | {'test': 10804} | {'test': {'average_document_length': 326753.5323943662, 'average_query_length': 47.89453536223562, 'num_documents': 355, 'num_queries': 10449, 'average_relevant_docs_per_query': 1.0}} | -| [LEMBNeedleRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhu et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Blog, Written] | {'test_256': 150, 'test_512': 150, 'test_1024': 150, 'test_2048': 150, 'test_4096': 150, 'test_8192': 150, 'test_16384': 150, 'test_32768': 150} | {'test_256': {'average_document_length': 1013.22, 'average_query_length': 60.48, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_512': {'average_document_length': 2009.96, 'average_query_length': 57.3, 
'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_1024': {'average_document_length': 4069.9, 'average_query_length': 58.28, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_2048': {'average_document_length': 8453.82, 'average_query_length': 59.92, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_4096': {'average_document_length': 17395.8, 'average_query_length': 55.86, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_8192': {'average_document_length': 35203.82, 'average_query_length': 59.6, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_16384': {'average_document_length': 72054.8, 'average_query_length': 59.12, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_32768': {'average_document_length': 141769.8, 'average_query_length': 58.34, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}} | -| [LEMBPasskeyRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhu et al., 2024) | ['eng'] | Retrieval | s2p | [Fiction, Written] | {'test_256': 150, 'test_512': 150, 'test_1024': 150, 'test_2048': 150, 'test_4096': 150, 'test_8192': 150, 'test_16384': 150, 'test_32768': 150} | {'test_256': {'average_document_length': 876.24, 'average_query_length': 38.1, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_512': {'average_document_length': 1785.2, 'average_query_length': 37.76, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_1024': {'average_document_length': 3607.18, 'average_query_length': 37.68, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_2048': {'average_document_length': 7242.2, 'average_query_length': 37.8, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 
1.0}, 'test_4096': {'average_document_length': 14518.16, 'average_query_length': 37.64, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_8192': {'average_document_length': 29071.16, 'average_query_length': 37.54, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_16384': {'average_document_length': 58175.16, 'average_query_length': 38.12, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}, 'test_32768': {'average_document_length': 116380.16, 'average_query_length': 37.74, 'num_documents': 100, 'num_queries': 50, 'average_relevant_docs_per_query': 1.0}} | -| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | {'test': 1724} | {'test': {'average_document_length': 53335.817258883246, 'average_query_length': 433.50294695481335, 'num_documents': 197, 'num_queries': 1527, 'average_relevant_docs_per_query': 1.0}} | -| [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | {'validation': 672} | {'validation': {'average_document_length': 30854.32738095238, 'average_query_length': 591.4910714285714, 'num_documents': 336, 'num_queries': 336, 'average_relevant_docs_per_query': 1.0}} | -| [LEMBWikimQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ho et al., 2020) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 500} | {'test': {'average_document_length': 37445.60333333333, 'average_query_length': 67.57, 'num_documents': 300, 'num_queries': 300, 'average_relevant_docs_per_query': 1.0}} | -| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Reviews, Web, Non-fiction, Fiction, 
Government, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'average_text_length': 109.546875, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'average_text_length': 110.86141428571429, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': {'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} | -| [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) | ['dan'] | Classification | s2s | [News, Web, Written] | {'test': 150} | {'test': 118.7} | -| [LeCaRDv2](https://github.com/THUIR/LeCaRDv2) (Haitao Li, 2023) | ['zho'] | Retrieval | p2p | [Legal, Written] | None | {'test': {'average_document_length': 7232.823978919631, 'average_query_length': 4259.440251572327, 'num_documents': 3795, 'num_queries': 159, 'average_relevant_docs_per_query': 24.50314465408805}} | -| [LearnedHandsBenefitsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 66} | {'test': 1308.44} | -| [LearnedHandsBusinessLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | 
{'test': 174} | {'test': 1144.51} | -| [LearnedHandsConsumerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 614} | {'test': 1277.45} | -| [LearnedHandsCourtsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 192} | {'test': 1171.02} | -| [LearnedHandsCrimeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 688} | {'test': 1212.9} | -| [LearnedHandsDivorceLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 150} | {'test': 1242.43} | -| [LearnedHandsDomesticViolenceLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 174} | {'test': 1360.83} | -| [LearnedHandsEducationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 56} | {'test': 1397.44} | -| [LearnedHandsEmploymentLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 710} | {'test': 1262.74} | -| [LearnedHandsEstatesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 178} | {'test': 1200.7} | -| [LearnedHandsFamilyLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 2048} | {'test': 1338.27} | -| 
[LearnedHandsHealthLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 226} | {'test': 1472.59} | -| [LearnedHandsHousingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 2048} | {'test': 1322.54} | -| [LearnedHandsImmigrationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 134} | {'test': 1216.31} | -| [LearnedHandsTortsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 432} | {'test': 1406.97} | -| [LearnedHandsTrafficLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 556} | {'test': 1182.91} | -| [LegalBenchConsumerContractsQA](https://huggingface.co/datasets/nguha/legalbench/viewer/consumer_contracts_qa) (Koreeda et al., 2021) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | {'test': {'average_document_length': 2745.8246753246754, 'average_query_length': 92.4090909090909, 'num_documents': 154, 'num_queries': 396, 'average_relevant_docs_per_query': 1.0}} | -| [LegalBenchCorporateLobbying](https://huggingface.co/datasets/nguha/legalbench/viewer/corporate_lobbying) (Neel Guha, 2023) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | {'test': {'average_document_length': 1157.2225705329154, 'average_query_length': 177.87941176470588, 'num_documents': 319, 'num_queries': 340, 'average_relevant_docs_per_query': 1.0}} | -| [LegalBenchPC](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | PairClassification | s2s | [Legal, Written] | {'test': 2048} | {'test': 287.18} | -| 
[LegalQuAD](https://github.com/Christoph911/AIKE2021_Appendix) (Hoppe et al., 2021) | ['deu'] | Retrieval | s2p | [Legal, Written] | None | {'test': {'average_document_length': 19481.955, 'average_query_length': 71.965, 'num_documents': 200, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}} | -| [LegalReasoningCausalityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 55} | {'test': 1563.76} | -| [LegalSummarization](https://github.com/lauramanor/legal_summarization) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | {'test': {'average_document_length': 606.1643835616438, 'average_query_length': 103.19014084507042, 'num_documents': 438, 'num_queries': 284, 'average_relevant_docs_per_query': 1.545774647887324}} | -| [LinceMTBitextMining](https://ritual.uh.edu/lince/) (Aguilar et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | {'train': 8060} | {'train': 58.67} | -| [LitSearchRetrieval](https://github.com/princeton-nlp/LitSearch) (Ajith et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | {'test': 597} | {'test': {'average_document_length': 841.2769, 'average_query_length': 141.2, 'num_documents': 64183, 'num_queries': 597, 'average_relevant_docs_per_query': 1.070351}} | -| [LivedoorNewsClustering.v2](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | {'test': 1106} | {'test': 1082.61} | -| [MAUDLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 2048} | {'test': 1802.93} | -| [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | {'dev': 44608} | {'dev': 506.3} | -| 
[MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | {'dev': {'ar': {'average_document_length': 318.6539598547405, 'average_query_length': 29.480662983425415, 'num_documents': 2061414, 'num_queries': 2896, 'average_relevant_docs_per_query': 1.953729281767956}, 'bn': {'average_document_length': 383.2428136511194, 'average_query_length': 46.98053527980535, 'num_documents': 297265, 'num_queries': 411, 'average_relevant_docs_per_query': 2.099756690997567}, 'de': {'average_document_length': 414.28004442393404, 'average_query_length': 46.0, 'num_documents': 15866222, 'num_queries': 305, 'average_relevant_docs_per_query': 2.6590163934426227}, 'en': {'average_document_length': 401.0042914921588, 'average_query_length': 40.247809762202756, 'num_documents': 32893221, 'num_queries': 799, 'average_relevant_docs_per_query': 2.911138923654568}, 'es': {'average_document_length': 403.71153493754986, 'average_query_length': 47.373456790123456, 'num_documents': 10373953, 'num_queries': 648, 'average_relevant_docs_per_query': 4.609567901234568}, 'fa': {'average_document_length': 262.6478385010321, 'average_query_length': 41.1503164556962, 'num_documents': 2207172, 'num_queries': 632, 'average_relevant_docs_per_query': 2.079113924050633}, 'fi': {'average_document_length': 359.87767671935734, 'average_query_length': 38.63493312352478, 'num_documents': 1883509, 'num_queries': 1271, 'average_relevant_docs_per_query': 1.925255704169945}, 'fr': {'average_document_length': 343.6283550271699, 'average_query_length': 43.883381924198254, 'num_documents': 14636953, 'num_queries': 343, 'average_relevant_docs_per_query': 2.131195335276968}, 'hi': {'average_document_length': 370.96196845914386, 'average_query_length': 53.34, 'num_documents': 506264, 'num_queries': 350, 'average_relevant_docs_per_query': 
2.1485714285714286}, 'id': {'average_document_length': 350.2785651811673, 'average_query_length': 37.958333333333336, 'num_documents': 1446315, 'num_queries': 960, 'average_relevant_docs_per_query': 3.216666666666667}, 'ja': {'average_document_length': 145.8538220556965, 'average_query_length': 17.71395348837209, 'num_documents': 6953614, 'num_queries': 860, 'average_relevant_docs_per_query': 2.0813953488372094}, 'ko': {'average_document_length': 173.97649170809927, 'average_query_length': 21.624413145539908, 'num_documents': 1486752, 'num_queries': 213, 'average_relevant_docs_per_query': 2.568075117370892}, 'ru': {'average_document_length': 332.2475377512674, 'average_query_length': 44.13258785942492, 'num_documents': 9543918, 'num_queries': 1252, 'average_relevant_docs_per_query': 2.8434504792332267}, 'sw': {'average_document_length': 228.71348655286377, 'average_query_length': 38.97095435684647, 'num_documents': 131924, 'num_queries': 482, 'average_relevant_docs_per_query': 1.887966804979253}, 'te': {'average_document_length': 396.2108674545774, 'average_query_length': 38.11231884057971, 'num_documents': 518079, 'num_queries': 828, 'average_relevant_docs_per_query': 1.0314009661835748}, 'th': {'average_document_length': 356.8283496198581, 'average_query_length': 42.87585266030014, 'num_documents': 542166, 'num_queries': 733, 'average_relevant_docs_per_query': 1.8321964529331514}, 'yo': {'average_document_length': 159.35250698366738, 'average_query_length': 37.6890756302521, 'num_documents': 49043, 'num_queries': 119, 'average_relevant_docs_per_query': 1.2100840336134453}, 'zh': {'average_document_length': 119.9458931721347, 'average_query_length': 10.867684478371501, 'num_documents': 4934368, 'num_queries': 393, 'average_relevant_docs_per_query': 2.5292620865139948}}} | -| [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 
'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | {'dev': {'average_document_length': 417.6655323669399, 'average_query_length': 37.46957385337667, 'num_documents': 2449382, 'num_queries': 11076, 'average_relevant_docs_per_query': 2.3643011917659806, 'hf_subset_descriptive_stats': {'ar': {'average_document_length': 438.1872433017704, 'average_query_length': 29.584, 'num_documents': 192103, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.982}, 'bn': {'average_document_length': 383.2428136511194, 'average_query_length': 46.98053527980535, 'num_documents': 297265, 'num_queries': 411, 'average_relevant_docs_per_query': 2.099756690997567}, 'de': {'average_document_length': 513.7796484139344, 'average_query_length': 46.0, 'num_documents': 71277, 'num_queries': 305, 'average_relevant_docs_per_query': 2.6590163934426227}, 'en': {'average_document_length': 529.2486406963214, 'average_query_length': 40.247809762202756, 'num_documents': 178768, 'num_queries': 799, 'average_relevant_docs_per_query': 2.911138923654568}, 'es': {'average_document_length': 535.8023645655877, 'average_query_length': 47.373456790123456, 'num_documents': 146750, 'num_queries': 648, 'average_relevant_docs_per_query': 4.609567901234568}, 'fa': {'average_document_length': 411.2648282882721, 'average_query_length': 41.1503164556962, 'num_documents': 133596, 'num_queries': 632, 'average_relevant_docs_per_query': 2.079113924050633}, 'fi': {'average_document_length': 462.9445310289844, 'average_query_length': 38.646, 'num_documents': 194415, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.918}, 'fr': {'average_document_length': 460.40909271865917, 'average_query_length': 43.883381924198254, 'num_documents': 75357, 'num_queries': 343, 'average_relevant_docs_per_query': 2.131195335276968}, 'hi': {'average_document_length': 498.6759426632417, 'average_query_length': 53.34, 'num_documents': 63066, 'num_queries': 350, 'average_relevant_docs_per_query': 2.1485714285714286}, 'id': 
{'average_document_length': 494.1689807519638, 'average_query_length': 37.958333333333336, 'num_documents': 168173, 'num_queries': 960, 'average_relevant_docs_per_query': 3.216666666666667}, 'ja': {'average_document_length': 206.13654293407583, 'average_query_length': 17.71395348837209, 'num_documents': 185319, 'num_queries': 860, 'average_relevant_docs_per_query': 2.0813953488372094}, 'ko': {'average_document_length': 257.82646155267594, 'average_query_length': 21.624413145539908, 'num_documents': 43293, 'num_queries': 213, 'average_relevant_docs_per_query': 2.568075117370892}, 'ru': {'average_document_length': 476.0820349224605, 'average_query_length': 44.055, 'num_documents': 219114, 'num_queries': 1000, 'average_relevant_docs_per_query': 2.833}, 'sw': {'average_document_length': 228.71348655286377, 'average_query_length': 38.97095435684647, 'num_documents': 131924, 'num_queries': 482, 'average_relevant_docs_per_query': 1.887966804979253}, 'te': {'average_document_length': 601.7099283059209, 'average_query_length': 38.11231884057971, 'num_documents': 101961, 'num_queries': 828, 'average_relevant_docs_per_query': 1.0314009661835748}, 'th': {'average_document_length': 478.8818849711528, 'average_query_length': 42.87585266030014, 'num_documents': 116649, 'num_queries': 733, 'average_relevant_docs_per_query': 1.8321964529331514}, 'yo': {'average_document_length': 159.35250698366738, 'average_query_length': 37.6890756302521, 'num_documents': 49043, 'num_queries': 119, 'average_relevant_docs_per_query': 1.2100840336134453}, 'zh': {'average_document_length': 147.36211243527777, 'average_query_length': 10.867684478371501, 'num_documents': 81309, 'num_queries': 393, 'average_relevant_docs_per_query': 2.5292620865139948}}}} | -| [MLQARetrieval](https://huggingface.co/datasets/mlqa) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 158083, 'validation': 15747} | {'validation': {'ara-ara': {'average_document_length': 
693.8883826879271, 'average_query_length': 42.321083172147, 'num_documents': 439, 'num_queries': 517, 'average_relevant_docs_per_query': 1.0}, 'ara-deu': {'average_document_length': 759.3882352941176, 'average_query_length': 55.14492753623188, 'num_documents': 170, 'num_queries': 207, 'average_relevant_docs_per_query': 1.0}, 'ara-eng': {'average_document_length': 693.8883826879271, 'average_query_length': 50.029013539651835, 'num_documents': 439, 'num_queries': 517, 'average_relevant_docs_per_query': 1.0}, 'ara-spa': {'average_document_length': 654.3071428571428, 'average_query_length': 53.68944099378882, 'num_documents': 140, 'num_queries': 161, 'average_relevant_docs_per_query': 1.0}, 'ara-hin': {'average_document_length': 626.5935483870968, 'average_query_length': 51.956989247311824, 'num_documents': 155, 'num_queries': 186, 'average_relevant_docs_per_query': 1.0}, 'ara-vie': {'average_document_length': 804.6216216216217, 'average_query_length': 49.57055214723926, 'num_documents': 148, 'num_queries': 163, 'average_relevant_docs_per_query': 1.0}, 'ara-zho': {'average_document_length': 787.3161290322581, 'average_query_length': 15.617021276595745, 'num_documents': 155, 'num_queries': 188, 'average_relevant_docs_per_query': 1.0}, 'deu-ara': {'average_document_length': 702.1675977653631, 'average_query_length': 43.06280193236715, 'num_documents': 179, 'num_queries': 207, 'average_relevant_docs_per_query': 1.0}, 'deu-deu': {'average_document_length': 721.405701754386, 'average_query_length': 52.572265625, 'num_documents': 456, 'num_queries': 512, 'average_relevant_docs_per_query': 1.0}, 'deu-eng': {'average_document_length': 721.405701754386, 'average_query_length': 48.33984375, 'num_documents': 456, 'num_queries': 512, 'average_relevant_docs_per_query': 1.0}, 'deu-spa': {'average_document_length': 677.2762430939226, 'average_query_length': 50.60204081632653, 'num_documents': 181, 'num_queries': 196, 'average_relevant_docs_per_query': 1.0}, 'deu-hin': 
{'average_document_length': 685.917808219178, 'average_query_length': 47.01840490797546, 'num_documents': 146, 'num_queries': 163, 'average_relevant_docs_per_query': 1.0}, 'deu-vie': {'average_document_length': 921.6196319018405, 'average_query_length': 46.81868131868132, 'num_documents': 163, 'num_queries': 182, 'average_relevant_docs_per_query': 1.0}, 'deu-zho': {'average_document_length': 736.6347305389221, 'average_query_length': 14.936842105263159, 'num_documents': 167, 'num_queries': 190, 'average_relevant_docs_per_query': 1.0}, 'eng-ara': {'average_document_length': 979.3447488584475, 'average_query_length': 42.321083172147, 'num_documents': 438, 'num_queries': 517, 'average_relevant_docs_per_query': 1.0}, 'eng-deu': {'average_document_length': 947.3109619686801, 'average_query_length': 52.572265625, 'num_documents': 447, 'num_queries': 512, 'average_relevant_docs_per_query': 1.0}, 'eng-eng': {'average_document_length': 940.2842535787321, 'average_query_length': 49.01480836236934, 'num_documents': 978, 'num_queries': 1148, 'average_relevant_docs_per_query': 1.0}, 'eng-spa': {'average_document_length': 904.3166287015945, 'average_query_length': 52.146, 'num_documents': 439, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}, 'eng-hin': {'average_document_length': 926.9621749408983, 'average_query_length': 49.3905325443787, 'num_documents': 423, 'num_queries': 507, 'average_relevant_docs_per_query': 1.0}, 'eng-vie': {'average_document_length': 1011.8296460176991, 'average_query_length': 48.082191780821915, 'num_documents': 452, 'num_queries': 511, 'average_relevant_docs_per_query': 1.0}, 'eng-zho': {'average_document_length': 1001.5046511627907, 'average_query_length': 15.39484126984127, 'num_documents': 430, 'num_queries': 504, 'average_relevant_docs_per_query': 1.0}, 'spa-ara': {'average_document_length': 674.3586206896551, 'average_query_length': 41.36024844720497, 'num_documents': 145, 'num_queries': 161, 'average_relevant_docs_per_query': 1.0}, 
'spa-deu': {'average_document_length': 544.0489130434783, 'average_query_length': 51.86734693877551, 'num_documents': 184, 'num_queries': 196, 'average_relevant_docs_per_query': 1.0}, 'spa-eng': {'average_document_length': 641.8215859030837, 'average_query_length': 49.156, 'num_documents': 454, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}, 'spa-spa': {'average_document_length': 641.8215859030837, 'average_query_length': 52.146, 'num_documents': 454, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}, 'spa-hin': {'average_document_length': 703.3212121212122, 'average_query_length': 48.080213903743314, 'num_documents': 165, 'num_queries': 187, 'average_relevant_docs_per_query': 1.0}, 'spa-vie': {'average_document_length': 737.8579545454545, 'average_query_length': 48.82539682539682, 'num_documents': 176, 'num_queries': 189, 'average_relevant_docs_per_query': 1.0}, 'spa-zho': {'average_document_length': 605.52, 'average_query_length': 15.590062111801242, 'num_documents': 150, 'num_queries': 161, 'average_relevant_docs_per_query': 1.0}, 'hin-ara': {'average_document_length': 670.0394736842105, 'average_query_length': 43.623655913978496, 'num_documents': 152, 'num_queries': 186, 'average_relevant_docs_per_query': 1.0}, 'hin-deu': {'average_document_length': 596.9718309859155, 'average_query_length': 51.41717791411043, 'num_documents': 142, 'num_queries': 163, 'average_relevant_docs_per_query': 1.0}, 'hin-eng': {'average_document_length': 691.5482352941176, 'average_query_length': 49.75936883629191, 'num_documents': 425, 'num_queries': 507, 'average_relevant_docs_per_query': 1.0}, 'hin-spa': {'average_document_length': 718.4904458598726, 'average_query_length': 52.75935828877005, 'num_documents': 157, 'num_queries': 187, 'average_relevant_docs_per_query': 1.0}, 'hin-hin': {'average_document_length': 691.5482352941176, 'average_query_length': 49.3905325443787, 'num_documents': 425, 'num_queries': 507, 'average_relevant_docs_per_query': 1.0}, 
'hin-vie': {'average_document_length': 778.484076433121, 'average_query_length': 48.35028248587571, 'num_documents': 157, 'num_queries': 177, 'average_relevant_docs_per_query': 1.0}, 'hin-zho': {'average_document_length': 685.0679012345679, 'average_query_length': 15.97883597883598, 'num_documents': 162, 'num_queries': 189, 'average_relevant_docs_per_query': 1.0}, 'vie-ara': {'average_document_length': 886.6052631578947, 'average_query_length': 41.214723926380366, 'num_documents': 152, 'num_queries': 163, 'average_relevant_docs_per_query': 1.0}, 'vie-deu': {'average_document_length': 981.4534161490683, 'average_query_length': 51.27472527472528, 'num_documents': 161, 'num_queries': 182, 'average_relevant_docs_per_query': 1.0}, 'vie-eng': {'average_document_length': 892.7250554323725, 'average_query_length': 48.09001956947162, 'num_documents': 451, 'num_queries': 511, 'average_relevant_docs_per_query': 1.0}, 'vie-spa': {'average_document_length': 936.6746987951807, 'average_query_length': 51.851851851851855, 'num_documents': 166, 'num_queries': 189, 'average_relevant_docs_per_query': 1.0}, 'vie-hin': {'average_document_length': 869.0509554140127, 'average_query_length': 46.44632768361582, 'num_documents': 157, 'num_queries': 177, 'average_relevant_docs_per_query': 1.0}, 'vie-vie': {'average_document_length': 892.7250554323725, 'average_query_length': 48.082191780821915, 'num_documents': 451, 'num_queries': 511, 'average_relevant_docs_per_query': 1.0}, 'vie-zho': {'average_document_length': 960.7349397590361, 'average_query_length': 15.048913043478262, 'num_documents': 166, 'num_queries': 184, 'average_relevant_docs_per_query': 1.0}, 'zho-ara': {'average_document_length': 238.75155279503105, 'average_query_length': 44.34574468085106, 'num_documents': 161, 'num_queries': 188, 'average_relevant_docs_per_query': 1.0}, 'zho-deu': {'average_document_length': 257.109756097561, 'average_query_length': 53.84736842105263, 'num_documents': 164, 'num_queries': 190, 
'average_relevant_docs_per_query': 1.0}, 'zho-eng': {'average_document_length': 246.65237020316027, 'average_query_length': 50.15079365079365, 'num_documents': 443, 'num_queries': 504, 'average_relevant_docs_per_query': 1.0}, 'zho-spa': {'average_document_length': 249.6081081081081, 'average_query_length': 52.857142857142854, 'num_documents': 148, 'num_queries': 161, 'average_relevant_docs_per_query': 1.0}, 'zho-hin': {'average_document_length': 238.5521472392638, 'average_query_length': 52.05291005291005, 'num_documents': 163, 'num_queries': 189, 'average_relevant_docs_per_query': 1.0}, 'zho-vie': {'average_document_length': 268.32142857142856, 'average_query_length': 49.33695652173913, 'num_documents': 168, 'num_queries': 184, 'average_relevant_docs_per_query': 1.0}, 'zho-zho': {'average_document_length': 246.65237020316027, 'average_query_length': 15.39484126984127, 'num_documents': 443, 'num_queries': 504, 'average_relevant_docs_per_query': 1.0}}, 'test': {'ara-ara': {'average_document_length': 698.5714593198451, 'average_query_length': 41.26176636039752, 'num_documents': 4646, 'num_queries': 5333, 'average_relevant_docs_per_query': 1.000375023438965}, 'ara-deu': {'average_document_length': 592.5728542914171, 'average_query_length': 51.27730582524272, 'num_documents': 1503, 'num_queries': 1648, 'average_relevant_docs_per_query': 1.0006067961165048}, 'ara-eng': {'average_document_length': 698.5714593198451, 'average_query_length': 48.556451612903224, 'num_documents': 4646, 'num_queries': 5332, 'average_relevant_docs_per_query': 1.000562640660165}, 'ara-spa': {'average_document_length': 713.4833239118146, 'average_query_length': 51.406471183013146, 'num_documents': 1769, 'num_queries': 1978, 'average_relevant_docs_per_query': 1.0}, 'ara-hin': {'average_document_length': 702.1388888888889, 'average_query_length': 48.71818678317859, 'num_documents': 1512, 'num_queries': 1831, 'average_relevant_docs_per_query': 1.0}, 'ara-vie': {'average_document_length': 
745.4528096017458, 'average_query_length': 48.815828041035665, 'num_documents': 1833, 'num_queries': 2047, 'average_relevant_docs_per_query': 1.0}, 'ara-zho': {'average_document_length': 774.4593639575971, 'average_query_length': 14.985355648535565, 'num_documents': 1698, 'num_queries': 1912, 'average_relevant_docs_per_query': 1.0}, 'deu-ara': {'average_document_length': 719.6800267201069, 'average_query_length': 39.54578532443905, 'num_documents': 1497, 'num_queries': 1649, 'average_relevant_docs_per_query': 1.0}, 'deu-deu': {'average_document_length': 725.5304712558599, 'average_query_length': 51.610680257035234, 'num_documents': 4053, 'num_queries': 4513, 'average_relevant_docs_per_query': 1.0008863283846665}, 'deu-eng': {'average_document_length': 725.5304712558599, 'average_query_length': 47.07777531575449, 'num_documents': 4053, 'num_queries': 4513, 'average_relevant_docs_per_query': 1.0008863283846665}, 'deu-spa': {'average_document_length': 740.5414052697616, 'average_query_length': 50.098591549295776, 'num_documents': 1594, 'num_queries': 1775, 'average_relevant_docs_per_query': 1.0005633802816902}, 'deu-hin': {'average_document_length': 674.3714063714064, 'average_query_length': 45.146153846153844, 'num_documents': 1287, 'num_queries': 1430, 'average_relevant_docs_per_query': 1.0}, 'deu-vie': {'average_document_length': 760.1198945981555, 'average_query_length': 46.64358208955224, 'num_documents': 1518, 'num_queries': 1675, 'average_relevant_docs_per_query': 1.0}, 'deu-zho': {'average_document_length': 771.3367697594501, 'average_query_length': 14.942592592592593, 'num_documents': 1455, 'num_queries': 1620, 'average_relevant_docs_per_query': 1.0006172839506173}, 'eng-ara': {'average_document_length': 1008.3584455058619, 'average_query_length': 41.26176636039752, 'num_documents': 4606, 'num_queries': 5333, 'average_relevant_docs_per_query': 1.000375023438965}, 'eng-deu': {'average_document_length': 910.3226686507936, 'average_query_length': 
51.610680257035234, 'num_documents': 4032, 'num_queries': 4513, 'average_relevant_docs_per_query': 1.0008863283846665}, 'eng-eng': {'average_document_length': 983.0993344090359, 'average_query_length': 47.960714902434816, 'num_documents': 9916, 'num_queries': 11582, 'average_relevant_docs_per_query': 1.000690726990157}, 'eng-spa': {'average_document_length': 967.4622376109068, 'average_query_length': 50.923252713768804, 'num_documents': 4621, 'num_queries': 5251, 'average_relevant_docs_per_query': 1.000380879832413}, 'eng-hin': {'average_document_length': 986.0465631929046, 'average_query_length': 47.328315703824245, 'num_documents': 4059, 'num_queries': 4916, 'average_relevant_docs_per_query': 1.000406834825061}, 'eng-vie': {'average_document_length': 1048.6062197940744, 'average_query_length': 48.094085532302095, 'num_documents': 4759, 'num_queries': 5495, 'average_relevant_docs_per_query': 1.0}, 'eng-zho': {'average_document_length': 1063.8536257833482, 'average_query_length': 15.019080996884735, 'num_documents': 4468, 'num_queries': 5136, 'average_relevant_docs_per_query': 1.0001947040498442}, 'spa-ara': {'average_document_length': 645.5182320441988, 'average_query_length': 40.78412537917088, 'num_documents': 1810, 'num_queries': 1978, 'average_relevant_docs_per_query': 1.0}, 'spa-deu': {'average_document_length': 586.6057810578105, 'average_query_length': 51.870913190529876, 'num_documents': 1626, 'num_queries': 1774, 'average_relevant_docs_per_query': 1.0011273957158964}, 'spa-eng': {'average_document_length': 630.6735979836169, 'average_query_length': 47.827907862173994, 'num_documents': 4761, 'num_queries': 5253, 'average_relevant_docs_per_query': 1.0}, 'spa-spa': {'average_document_length': 630.6735979836169, 'average_query_length': 50.923252713768804, 'num_documents': 4761, 'num_queries': 5251, 'average_relevant_docs_per_query': 1.000380879832413}, 'spa-hin': {'average_document_length': 613.3478260869565, 'average_query_length': 46.36680208937899, 
'num_documents': 1518, 'num_queries': 1723, 'average_relevant_docs_per_query': 1.0}, 'spa-vie': {'average_document_length': 659.6179295624333, 'average_query_length': 48.1595639246779, 'num_documents': 1874, 'num_queries': 2018, 'average_relevant_docs_per_query': 1.0}, 'spa-zho': {'average_document_length': 668.6646171045277, 'average_query_length': 15.115562403697997, 'num_documents': 1789, 'num_queries': 1947, 'average_relevant_docs_per_query': 1.0}, 'hin-ara': {'average_document_length': 765.0352862849534, 'average_query_length': 42.04642271982523, 'num_documents': 1502, 'num_queries': 1831, 'average_relevant_docs_per_query': 1.0}, 'hin-deu': {'average_document_length': 719.676862745098, 'average_query_length': 51.002799160251925, 'num_documents': 1275, 'num_queries': 1429, 'average_relevant_docs_per_query': 1.000699790062981}, 'hin-eng': {'average_document_length': 760.9956086850451, 'average_query_length': 47.91232709519935, 'num_documents': 4099, 'num_queries': 4916, 'average_relevant_docs_per_query': 1.000406834825061}, 'hin-spa': {'average_document_length': 753.5010281014394, 'average_query_length': 50.46689895470383, 'num_documents': 1459, 'num_queries': 1722, 'average_relevant_docs_per_query': 1.0005807200929153}, 'hin-hin': {'average_document_length': 760.9956086850451, 'average_query_length': 47.328315703824245, 'num_documents': 4099, 'num_queries': 4916, 'average_relevant_docs_per_query': 1.000406834825061}, 'hin-vie': {'average_document_length': 789.9253822629969, 'average_query_length': 48.21160760143811, 'num_documents': 1635, 'num_queries': 1947, 'average_relevant_docs_per_query': 1.0}, 'hin-zho': {'average_document_length': 834.2057448229793, 'average_query_length': 15.101301641199774, 'num_documents': 1497, 'num_queries': 1767, 'average_relevant_docs_per_query': 1.0}, 'vie-ara': {'average_document_length': 992.2129527991218, 'average_query_length': 41.82462139716659, 'num_documents': 1822, 'num_queries': 2047, 'average_relevant_docs_per_query': 
1.0}, 'vie-deu': {'average_document_length': 861.0610079575597, 'average_query_length': 51.58721624850657, 'num_documents': 1508, 'num_queries': 1674, 'average_relevant_docs_per_query': 1.0005973715651135}, 'vie-eng': {'average_document_length': 913.8633993743483, 'average_query_length': 48.11086837793555, 'num_documents': 4795, 'num_queries': 5493, 'average_relevant_docs_per_query': 1.0003640997633352}, 'vie-spa': {'average_document_length': 940.0322580645161, 'average_query_length': 51.13386217154189, 'num_documents': 1829, 'num_queries': 2017, 'average_relevant_docs_per_query': 1.0004957858205255}, 'vie-hin': {'average_document_length': 838.1713414634146, 'average_query_length': 47.484334874165384, 'num_documents': 1640, 'num_queries': 1947, 'average_relevant_docs_per_query': 1.0}, 'vie-vie': {'average_document_length': 913.8633993743483, 'average_query_length': 48.094085532302095, 'num_documents': 4795, 'num_queries': 5495, 'average_relevant_docs_per_query': 1.0}, 'vie-zho': {'average_document_length': 999.064534883721, 'average_query_length': 15.045805455481215, 'num_documents': 1720, 'num_queries': 1943, 'average_relevant_docs_per_query': 1.0}, 'zho-ara': {'average_document_length': 253.71303841676368, 'average_query_length': 42.04866562009419, 'num_documents': 1718, 'num_queries': 1911, 'average_relevant_docs_per_query': 1.000523286237572}, 'zho-deu': {'average_document_length': 241.84631147540983, 'average_query_length': 52.25107958050586, 'num_documents': 1464, 'num_queries': 1621, 'average_relevant_docs_per_query': 1.0}, 'zho-eng': {'average_document_length': 247.55609326880776, 'average_query_length': 48.64167478091529, 'num_documents': 4546, 'num_queries': 5135, 'average_relevant_docs_per_query': 1.0003894839337877}, 'zho-spa': {'average_document_length': 254.44552196235026, 'average_query_length': 51.90446841294299, 'num_documents': 1753, 'num_queries': 1947, 'average_relevant_docs_per_query': 1.0}, 'zho-hin': {'average_document_length': 
229.60590163934427, 'average_query_length': 49.06625141562854, 'num_documents': 1525, 'num_queries': 1766, 'average_relevant_docs_per_query': 1.0005662514156286}, 'zho-vie': {'average_document_length': 266.1140401146132, 'average_query_length': 49.27328872876994, 'num_documents': 1745, 'num_queries': 1943, 'average_relevant_docs_per_query': 1.0}, 'zho-zho': {'average_document_length': 247.55609326880776, 'average_query_length': 15.019080996884735, 'num_documents': 4546, 'num_queries': 5136, 'average_relevant_docs_per_query': 1.0001947040498442}}} | -| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Written] | {'dev': 1500, 'test': 1500} | {'dev': {'average_document_length': 258.8772727272727, 'average_query_length': 45.05533333333333, 'num_documents': 11000, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'test': {'average_document_length': 258.8772727272727, 'average_query_length': 45.75333333333333, 'num_documents': 11000, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}} | -| [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | {'validation': 2048, 'test': 2048} | {'validation': 4613, 'test': 4810} | -| [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | {'validation': 750, 'test': 756} | {'validation': 4613, 'test': 4810} | -| [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | -| [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 114.41787048392986, 'average_query_length': 10.51131805157593, 'num_documents': 106813, 'num_queries': 6980, 'average_relevant_docs_per_query': 
1.0654727793696275}} | -| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 335.79716603691344, 'average_query_length': 33.21898281898998, 'num_documents': 8841823, 'num_queries': 502939, 'average_relevant_docs_per_query': 1.0592755781516248}, 'dev': {'average_document_length': 335.79716603691344, 'average_query_length': 33.2621776504298, 'num_documents': 8841823, 'num_queries': 6980, 'average_relevant_docs_per_query': 1.0654727793696275}, 'test': {'average_document_length': 335.79716603691344, 'average_query_length': 32.74418604651163, 'num_documents': 8841823, 'num_queries': 43, 'average_relevant_docs_per_query': 95.3953488372093}} | -| [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | {'test': {'average_document_length': 349.3574939240471, 'average_query_length': 33.02325581395349, 'num_documents': 8841823, 'num_queries': 43, 'average_relevant_docs_per_query': 95.3953488372093}} | -| [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | {'test': 43} | {'test': {'average_document_length': 382.3476426537285, 'average_query_length': 33.02325581395349, 'num_documents': 9481, 'num_queries': 43, 'average_relevant_docs_per_query': 95.3953488372093}} | -| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | {'test': 43} | {'test': {'average_document_length': 355.2909668633681, 'average_query_length': 32.74418604651163, 'num_documents': 8812, 'num_queries': 43, 'average_relevant_docs_per_query': 95.3953488372093}} | -======= | 
[JSICK](https://github.com/sbintuitions/JMTEB) (Yanaka et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | | [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) | ['jpn'] | STS | s2s | [Web, Written] | None | None | | [JaGovFaqsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | | [JavaneseIMDBClassification](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | None | None | -| [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [News, Encyclopaedic, Written] | None | None | -| [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [Reviews, News, Spoken, Written, Spoken] | None | None | +| [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [Encyclopaedic, News, Written] | None | None | +| [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [News, Reviews, Spoken, Spoken, Written] | None | None | | [KLUE-TC](https://arxiv.org/abs/2105.09680) (Sungjoon 
Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | None | None | | [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | | [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | -| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [News, Written] | None | None | +| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [Financial, News, Written] | None | None | | [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | | [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | None | None | @@ -548,7 +371,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | | [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | | [LEMBWikimQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ho et al., 2020) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Reviews, Web, Non-fiction, Fiction, Government, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': 
{'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} | +| [LLaVAIT2TRetrieval](https://github.com/LinWeizheDragon/FLMR/blob/main/docs/Datasets.md) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None | +| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Fiction, Government, Non-fiction, Reviews, Web, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': {'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, 
'9': {'count': 3500}}}} | | [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) | ['dan'] | Classification | s2s | [News, Web, Written] | None | None | | [LeCaRDv2](https://github.com/THUIR/LeCaRDv2) (Haitao Li, 2023) | ['zho'] | Retrieval | p2p | [Legal, Written] | None | None | | [LearnedHandsBenefitsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -577,21 +401,26 @@ The following tables give you an overview of the tasks in MTEB. | [LitSearchRetrieval](https://github.com/princeton-nlp/LitSearch) (Ajith et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [LivedoorNewsClustering.v2](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | | [MAUDLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [METI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | 
[MLQARetrieval](https://huggingface.co/datasets/mlqa) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Written] | None | None | +| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | | [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | None | None | | [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None | | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | -| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [MNIST](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [MSCOCOI2TRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | +| [MSCOCOT2IRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | +| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen 
and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | +| [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | -| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | ->>>>>>> main -| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | +| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | 
[MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | @@ -599,92 +428,39 @@ The following tables give you an overview of the tasks in MTEB. | [MalteseNewsClassification](https://huggingface.co/datasets/MLRS/maltese_news_categories) | ['mlt'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [MarathiNewsClassification](https://github.com/goru001/nlp-for-marathi) (Anoop Kunchukuttan, 2020) | ['mar'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Classification | s2s | [News, Written] | None | None | -| [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Written, Non-fiction] | None | None | +| [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [MasakhaNEWSClusteringS2S](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | s2s | | None | None | -<<<<<<< HEAD -| [MassiveIntentClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 
'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | {'validation': 2033, 'test': 2974} | {'validation': 34.8, 'test': 34.6} | -| [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | {'validation': 2033, 'test': 2974} | {'validation': 34.8, 'test': 34.6} | -| [MedicalQARetrieval](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4) (Asma et al., 2019) | ['eng'] | Retrieval | s2s | [Medical, Written] | {'test': 2048} | {'test': {'average_document_length': 1153.482421875, 'average_query_length': 52.4794921875, 'num_documents': 2048, 'num_queries': 2048, 'average_relevant_docs_per_query': 1.0}} | -| [MedicalRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 122.04231725066585, 'average_query_length': 17.938, 'num_documents': 100999, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [MedrxivClusteringP2P.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 1500} | {'test': 1984.7} | -| [MedrxivClusteringS2S.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Medical, Written] | {'test': 1500} | {'test': 114.9} | -| 
[MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | {'test': 992} | {'test': 95} | -| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | ['eng'] | Reranking | s2s | [News, Written] | {'test': 107968} | {'test': 70.9} | -| MintakaRetrieval | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | {'test': {'ar': {'average_document_length': 12.736418511066399, 'average_query_length': 55.275533363595095, 'num_documents': 1491, 'num_queries': 2203, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 14.40060422960725, 'average_query_length': 65.41322662173546, 'num_documents': 1655, 'num_queries': 2374, 'average_relevant_docs_per_query': 1.0}, 'es': {'average_document_length': 14.291789722386296, 'average_query_length': 64.88325082508251, 'num_documents': 1693, 'num_queries': 2424, 'average_relevant_docs_per_query': 1.0}, 'fr': {'average_document_length': 14.407234539089849, 'average_query_length': 68.88452088452088, 'num_documents': 1714, 'num_queries': 2442, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 12.71038961038961, 'average_query_length': 58.404637247569184, 'num_documents': 770, 'num_queries': 1337, 'average_relevant_docs_per_query': 1.0}, 'it': {'average_document_length': 14.365985576923077, 'average_query_length': 64.39707724425887, 'num_documents': 1664, 'num_queries': 2395, 'average_relevant_docs_per_query': 1.0004175365344468}, 'ja': {'average_document_length': 9.167713567839195, 'average_query_length': 29.961937716262977, 'num_documents': 1592, 'num_queries': 2312, 'average_relevant_docs_per_query': 1.0}, 'pt': {'average_document_length': 14.244471744471744, 'average_query_length': 60.42225998300765, 'num_documents': 1628, 'num_queries': 2354, 'average_relevant_docs_per_query': 1.0004248088360237}}} | -| [Moroco](https://huggingface.co/datasets/moroco) (Andrei M. 
Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | {'test': 2048} | {'test': 1710.94} | -| [MovieReviewSentimentClassification](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Théophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | {'validation': 1024, 'test': 1024} | {'validation': 550.3, 'test': 558.1} | -| [MrTidyRetrieval](https://huggingface.co/datasets/castorini/mr-tydi) (Xinyu Zhang, 2021) | ['ara', 'ben', 'eng', 'fin', 'ind', 'jpn', 'kor', 'rus', 'swa', 'tel', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written] | | | -| [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Legal, Government, Written] | {'test': 5000} | {'test': {'average_text_length': 12014.408930434782, 'average_label_per_text': 3.5938, 'num_samples': 115000, 'unique_labels': 21, 'labels': {'18': {'count': 50784}, '15': {'count': 30981}, '5': {'count': 24978}, '6': {'count': 45080}, '3': {'count': 63687}, '17': {'count': 37743}, '1': {'count': 15019}, '20': {'count': 14030}, '0': {'count': 17802}, '2': {'count': 22402}, '19': {'count': 10212}, '9': {'count': 3772}, '4': {'count': 9062}, '10': {'count': 7705}, '11': {'count': 12213}, '7': {'count': 14306}, '12': {'count': 11799}, '8': {'count': 13800}, '13': {'count': 2346}, '14': {'count': 4255}, '16': {'count': 1311}}, 'hf_subset_descriptive_stats': {'en': {'average_text_length': 11720.2926, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, 
'9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'de': {'average_text_length': 12865.4162, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'fr': {'average_text_length': 13081.1098, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'it': {'average_text_length': 12763.4786, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'es': 
{'average_text_length': 13080.29, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'pl': {'average_text_length': 12282.5926, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'ro': {'average_text_length': 12836.9322, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'nl': {'average_text_length': 12857.9742, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, 
'17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'el': {'average_text_length': 12998.143, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'hu': {'average_text_length': 12424.641, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'pt': {'average_text_length': 12482.4616, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, 
'12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'cs': {'average_text_length': 10783.4676, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'sv': {'average_text_length': 11612.4774, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'bg': {'average_text_length': 12235.4268, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'da': {'average_text_length': 11773.958, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 
'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'fi': {'average_text_length': 12087.6862, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'sk': {'average_text_length': 11130.814, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'lt': {'average_text_length': 11245.3566, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': 
{'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'hr': {'average_text_length': 11022.142, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'sl': {'average_text_length': 10620.0594, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'et': {'average_text_length': 10898.4312, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 
'lv': {'average_text_length': 10938.5102, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}, 'mt': {'average_text_length': 12589.7442, 'average_label_per_text': 3.5938, 'num_samples': 5000, 'unique_labels': 21, 'labels': {'18': {'count': 2208}, '15': {'count': 1347}, '5': {'count': 1086}, '6': {'count': 1960}, '3': {'count': 2769}, '17': {'count': 1641}, '1': {'count': 653}, '20': {'count': 610}, '0': {'count': 774}, '2': {'count': 974}, '19': {'count': 444}, '9': {'count': 164}, '4': {'count': 394}, '10': {'count': 335}, '11': {'count': 531}, '7': {'count': 622}, '12': {'count': 513}, '8': {'count': 600}, '13': {'count': 102}, '14': {'count': 185}, '16': {'count': 57}}}}}} | -| [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | {'test': 10000} | {'test': 45.9} | -| [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written, Web, Non-fiction, Fiction] | None | {'dev': {'ar': {'average_document_length': 29234.48153016958, 'average_query_length': 69.27, 'num_documents': 7607, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 33771.2111, 'average_query_length': 153.63, 'num_documents': 10000, 'num_queries': 200, 
'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 13332.76764, 'average_query_length': 81.22, 'num_documents': 200000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'es': {'average_document_length': 36567.1736990891, 'average_query_length': 123.11, 'num_documents': 9551, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'fr': {'average_document_length': 36009.4934, 'average_query_length': 142.165, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 18688.50788229112, 'average_query_length': 77.995, 'num_documents': 3806, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'it': {'average_document_length': 36633.9969, 'average_query_length': 99.615, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ja': {'average_document_length': 14480.7508, 'average_query_length': 61.625, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ko': {'average_document_length': 13813.441224093263, 'average_query_length': 58.845, 'num_documents': 6176, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'pt': {'average_document_length': 32127.576952351956, 'average_query_length': 122.275, 'num_documents': 6569, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ru': {'average_document_length': 35934.8756, 'average_query_length': 87.875, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'th': {'average_document_length': 25993.2696, 'average_query_length': 107.81, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'zh': {'average_document_length': 6039.059725, 'average_query_length': 26.79, 'num_documents': 200000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}}, 'test': {'ar': {'average_document_length': 29234.48153016958, 'average_query_length': 75.77, 'num_documents': 7607, 'num_queries': 200, 
'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 33771.2111, 'average_query_length': 123.65, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 13332.76764, 'average_query_length': 81.33, 'num_documents': 200000, 'num_queries': 800, 'average_relevant_docs_per_query': 1.0}, 'es': {'average_document_length': 36567.1736990891, 'average_query_length': 131.985, 'num_documents': 9551, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'fr': {'average_document_length': 36009.4934, 'average_query_length': 149.795, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 18688.50788229112, 'average_query_length': 103.76, 'num_documents': 3806, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'it': {'average_document_length': 36633.9969, 'average_query_length': 114.595, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ja': {'average_document_length': 14480.7508, 'average_query_length': 55.73, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ko': {'average_document_length': 13813.441224093263, 'average_query_length': 58.72, 'num_documents': 6176, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'pt': {'average_document_length': 32127.576952351956, 'average_query_length': 113.455, 'num_documents': 6569, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'ru': {'average_document_length': 35934.8756, 'average_query_length': 94.87, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'th': {'average_document_length': 25993.2696, 'average_query_length': 97.99, 'num_documents': 10000, 'num_queries': 200, 'average_relevant_docs_per_query': 1.0}, 'zh': {'average_document_length': 6039.059725, 'average_query_length': 24.70875, 'num_documents': 200000, 'num_queries': 800, 
'average_relevant_docs_per_query': 1.0}}} | -| [MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | -| [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | {'test': 7000} | {'test': 56} | -| [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | {'train': 2048} | {'train': 174.2} | -| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1589.783925130746, 'average_query_length': 21.764705882352942, 'num_documents': 3633, 'num_queries': 323, 'average_relevant_docs_per_query': 38.18575851393189}} | -| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1652.1926782273604, 'average_query_length': 24.390092879256965, 'num_documents': 3633, 'num_queries': 323, 'average_relevant_docs_per_query': 38.18575851393189}} | -| [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 2052.8611111111113, 'average_query_length': 439.2772277227723, 'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 441.6746031746032, 'average_query_length': 27.60891089108911, 
'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 2052.8611111111113, 'average_query_length': 27.60891089108911, 'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 492.2287851281462, 'average_query_length': 48.17902665121669, 'num_documents': 2681468, 'num_queries': 3452, 'average_relevant_docs_per_query': 1.2169756662804172}} | -| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 502.14302128535564, 'average_query_length': 48.31662804171495, 'num_documents': 2681468, 'num_queries': 3452, 'average_relevant_docs_per_query': 1.2169756662804172}} | -| [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 610.7449138094336, 'average_query_length': 48.381, 'num_documents': 184765, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.213}} | -| [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 602.7903551179953, 'average_query_length': 47.878, 'num_documents': 198779, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.213}} | -| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 
'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': 120} | -| [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 292} | {'test': 159.45} | -| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | {'test': 4800} | {'test': 72.81} | -| [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomรกลก Koฤiskรฝ, 2017) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 326753.5323943662, 'average_query_length': 47.730889457232166, 'num_documents': 355, 'num_queries': 10557, 'average_relevant_docs_per_query': 1.0}} | -| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | {'train': 5975, 'test': 1495} | {'train': 196.61, 'test': 196.017} | -| [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 2232130, 'zho': 3179323, 'rus': 4627657} | {'test': {'fas': {'average_document_length': 2032.093148525817, 'average_query_length': 85.4298245614035, 'num_documents': 2232016, 'num_queries': 114, 'average_relevant_docs_per_query': 12.912280701754385}, 'rus': 
{'average_document_length': 1757.9129983233004, 'average_query_length': 85.58771929824562, 'num_documents': 4627543, 'num_queries': 114, 'average_relevant_docs_per_query': 16.57017543859649}, 'zho': {'average_document_length': 743.1426659901881, 'average_query_length': 24.17543859649123, 'num_documents': 3179209, 'num_queries': 114, 'average_relevant_docs_per_query': 18.710526315789473}}} | -| [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2066.9453653646488, 'average_query_length': 63.529411764705884, 'num_documents': 27931, 'num_queries': 136, 'average_relevant_docs_per_query': 40.39705882352941, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2816.847782031074, 'average_query_length': 83.26666666666667, 'num_documents': 8882, 'num_queries': 45, 'average_relevant_docs_per_query': 32.71111111111111}, 'rus': {'average_document_length': 2446.5574277854193, 'average_query_length': 85.56818181818181, 'num_documents': 8724, 'num_queries': 44, 'average_relevant_docs_per_query': 42.93181818181818}, 'zho': {'average_document_length': 1101.0984987893462, 'average_query_length': 24.0, 'num_documents': 10325, 'num_queries': 47, 'average_relevant_docs_per_query': 45.38297872340426}}}} | -| [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 2232092, 'zho': 3179285, 'rus': 4627619} | {'test': {'fas': {'average_document_length': 2032.093148525817, 'average_query_length': 65.48684210526316, 'num_documents': 2232016, 'num_queries': 76, 'average_relevant_docs_per_query': 66.28947368421052}, 'rus': {'average_document_length': 1757.9129983233004, 'average_query_length': 74.4342105263158, 'num_documents': 4627543, 'num_queries': 76, 'average_relevant_docs_per_query': 62.223684210526315}, 'zho': {'average_document_length': 
743.1426659901881, 'average_query_length': 22.210526315789473, 'num_documents': 3179209, 'num_queries': 76, 'average_relevant_docs_per_query': 53.68421052631579}}} | -| [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2236.175955333482, 'average_query_length': 54.10267857142857, 'num_documents': 49433, 'num_queries': 224, 'average_relevant_docs_per_query': 61.816964285714285, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2895.869857421016, 'average_query_length': 65.89189189189189, 'num_documents': 15921, 'num_queries': 74, 'average_relevant_docs_per_query': 68.08108108108108}, 'rus': {'average_document_length': 2724.294762109928, 'average_query_length': 74.41333333333333, 'num_documents': 16247, 'num_queries': 75, 'average_relevant_docs_per_query': 63.053333333333335}, 'zho': {'average_document_length': 1168.4984071821605, 'average_query_length': 22.16, 'num_documents': 17265, 'num_queries': 75, 'average_relevant_docs_per_query': 54.4}}}} | -| [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 61906} | {'eng': 2983.724665391969} | -| [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | {'test': 7600} | {'test': 235.29} | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | {'test': 2050} | {'test': 82} | -| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Social, Reviews, Written] | {'train': 1640} | {'train': 135.91} | -| [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | {'test': 2602} | 
{'test': {'average_document_length': 214.5114503816794, 'average_query_length': 47.896484375, 'num_documents': 1048, 'num_queries': 1024, 'average_relevant_docs_per_query': 2.0}} | -| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | {'test': 3000} | {'test': 78.2} | -| [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 2050} | {'test': 1884.0} | -| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | {'test': 1200, 'validation': 1200} | {'test': 1884.0, 'validation': 1911.0} | -| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | {'train': 15516, 'validation': 2948, 'test': 6250} | {'train': 740.24, 'validation': 740.66, 'test': 740.71} | -| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | {'train': 15516, 'validation': 2948, 'test': 6250} | {'train': 740.24, 'validation': 740.66, 'test': 740.71} | -| [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'average_sentence1_length': 145.4552390438247, 'average_sentence2_length': 148.56607569721115, 'num_samples': 50200, 'hf_subset_descriptive_stats': {'ind-abs': {'average_sentence1_length': 148.366, 'average_sentence2_length': 147.314, 'num_samples': 1000}, 'ind-btk': 
{'average_sentence1_length': 145.36666666666667, 'average_sentence2_length': 146.74045454545455, 'num_samples': 6600}, 'ind-bew': {'average_sentence1_length': 145.4280303030303, 'average_sentence2_length': 148.40530303030303, 'num_samples': 6600}, 'ind-bhp': {'average_sentence1_length': 133.528, 'average_sentence2_length': 128.138, 'num_samples': 1000}, 'ind-jav': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 145.8089393939394, 'num_samples': 6600}, 'ind-mad': {'average_sentence1_length': 145.35545454545453, 'average_sentence2_length': 153.6228787878788, 'num_samples': 6600}, 'ind-mak': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 150.6128787878788, 'num_samples': 6600}, 'ind-min': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 148.0621212121212, 'num_samples': 6600}, 'ind-mui': {'average_sentence1_length': 150.454, 'average_sentence2_length': 150.994, 'num_samples': 1000}, 'ind-rej': {'average_sentence1_length': 151.622, 'average_sentence2_length': 139.583, 'num_samples': 1000}, 'ind-sun': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 150.9880303030303, 'num_samples': 6600}}}} | -| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Reviews, Web, Social, Constructed, Written] | {'test': 4800} | {'test': 52.4} | -| [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | {'train': 5500} | {'train': 157.15} | -| [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 88} | {'test': 195.2} | -| 
[OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1334} | {'test': 246.69} | -| [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 110} | {'test': 223.16} | -| [OPP115FirstPartyCollectionUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 2086} | {'test': 204.25} | -| [OPP115InternationalAndSpecificAudiencesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 980} | {'test': 327.71} | -| [OPP115PolicyChangeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 431} | {'test': 200.99} | -| [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1590} | {'test': 223.64} | -| [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 462} | {'test': 218.59} | -| [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1546} | {'test': 210.62} | -======= | [MassiveIntentClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 
'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | | [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | | [MedicalQARetrieval](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4) (Asma et al., 2019) | ['eng'] | Retrieval | s2s | [Medical, Written] | None | None | | [MedicalRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | | [MedrxivClusteringP2P.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 74294927, 'min_text_length': 148, 'average_text_length': 1981.2, 'max_text_length': 38759, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 
189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} | | [MedrxivClusteringS2S.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 4301276, 'min_text_length': 18, 'average_text_length': 114.7, 'max_text_length': 339, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 
'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} | +| [MemotionI2TRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | +| [MemotionT2IRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | | 
[MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | | [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | ['eng'] | Reranking | s2s | [News, Written] | None | None | | MintakaRetrieval | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [Moroco](https://huggingface.co/datasets/moroco) (Andrei M. Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | None | None | | [MovieReviewSentimentClassification](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Thรฉophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [MrTidyRetrieval](https://huggingface.co/datasets/castorini/mr-tydi) (Xinyu Zhang, 2021) | ['ara', 'ben', 'eng', 'fin', 'ind', 'jpn', 'kor', 'rus', 'swa', 'tel', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Legal, Government, Written] | None | None | +| [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Government, Legal, Written] | None | None | | [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | -| [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv 
Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written, Web, Non-fiction, Fiction] | None | None | +| [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Fiction, Non-fiction, Web, Written] | None | None | | [MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | | [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | | [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. 
Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | -| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | +| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | +| [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NIGHTSI2IRetrieval](https://proceedings.neurips.cc/paper_files/paper/2023/hash/9f09f316a3eaf59d9ced5ffaefe97e0f-Abstract-Conference.html) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | -| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | +| 
[NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | +| [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | +| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [NQ-Fa](https://huggingface.co/datasets/MCINext/nq-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | @@ -693,16 +469,16 @@ The following tables give you an overview of the tasks in MTEB. 
| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [NamaaMrTydiReranking](https://huggingface.co/NAMAA-Space) (Muennighoff et al., 2022) | ['ara'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [NanoArguAnaRetrieval](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | -| [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Non-fiction, Academic, News] | None | None | +| [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, News, Non-fiction] | None | None | | [NanoDBPediaRetrieval](https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia) (Lehmann et al., 2015) | ['eng'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NanoFEVERRetrieval](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic] | None | None | | [NanoFiQA2018Retrieval](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Academic, Social] | None | None | | [NanoHotpotQARetrieval](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [NanoMSMARCORetrieval](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Web] | None | None | -| [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | 
[NanoNQRetrieval](https://ai.google.com/research/NaturalQuestions) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Academic, Web] | None | None | | [NanoQuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Social] | None | None | -| [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [NanoSciFactRetrieval](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) | ['eng'] | Retrieval | s2p | [Academic] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomรกลก Koฤiskรฝ, 2017) | ['eng'] | Retrieval | s2p | | None | None | @@ -713,17 +489,18 @@ The following tables give you an overview of the tasks in MTEB. 
| [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | | [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | None | None | -| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Social, Reviews, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 
'min_sentence2_length': 3, 'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | +| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | +| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Reviews, Social, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 
'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | | [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | | [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 228} | {'test': {'num_samples': 228, 'number_of_characters': 37441, 'unique_pairs': 228, 'min_sentence1_length': 13, 'average_sentence1_length': 82.2, 'max_sentence1_length': 272, 'unique_sentence1': 227, 'min_sentence2_length': 10, 'average_sentence2_length': 82.02, 'max_sentence2_length': 269, 'unique_sentence2': 226}} | | [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | -| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | -| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | +| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | 
Classification | s2s | [Fiction, Non-fiction, Written] | None | None | +| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | | [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'num_samples': 50200, 'number_of_characters': 14759870, 'unique_pairs': 50140, 'min_sentence1_length': 5, 'average_sentence1_length': 145.46, 'max_sentence1_length': 873, 'unique_sentence1': 8258, 'min_sentence2_length': 5, 'average_sentence2_length': 148.57, 'max_sentence2_length': 980, 'unique_sentence2': 50102, 'hf_subset_descriptive_stats': {'ind-abs': {'num_samples': 1000, 'number_of_characters': 295680, 'unique_pairs': 999, 'min_sentence1_length': 5, 'average_sentence1_length': 148.37, 'max_sentence1_length': 727, 'unique_sentence1': 998, 'min_sentence2_length': 6, 'average_sentence2_length': 147.31, 'max_sentence2_length': 629, 'unique_sentence2': 998}, 'ind-btk': {'num_samples': 6600, 'number_of_characters': 1927907, 'unique_pairs': 6597, 'min_sentence1_length': 5, 'average_sentence1_length': 145.37, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 146.74, 'max_sentence2_length': 980, 'unique_sentence2': 6596}, 'ind-bew': {'num_samples': 6600, 'number_of_characters': 1939300, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.41, 'max_sentence2_length': 840, 'unique_sentence2': 6590}, 'ind-bhp': {'num_samples': 1000, 'number_of_characters': 261666, 'unique_pairs': 1000, 
'min_sentence1_length': 11, 'average_sentence1_length': 133.53, 'max_sentence1_length': 468, 'unique_sentence1': 999, 'min_sentence2_length': 10, 'average_sentence2_length': 128.14, 'max_sentence2_length': 459, 'unique_sentence2': 999}, 'ind-jav': {'num_samples': 6600, 'number_of_characters': 1922162, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 145.81, 'max_sentence2_length': 854, 'unique_sentence2': 6585}, 'ind-mad': {'num_samples': 6600, 'number_of_characters': 1973257, 'unique_pairs': 6598, 'min_sentence1_length': 5, 'average_sentence1_length': 145.36, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 153.62, 'max_sentence2_length': 827, 'unique_sentence2': 6592}, 'ind-mak': {'num_samples': 6600, 'number_of_characters': 1953868, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 150.61, 'max_sentence2_length': 888, 'unique_sentence2': 6586}, 'ind-min': {'num_samples': 6600, 'number_of_characters': 1937033, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.06, 'max_sentence2_length': 837, 'unique_sentence2': 6591}, 'ind-mui': {'num_samples': 1000, 'number_of_characters': 301448, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 150.45, 'max_sentence1_length': 451, 'unique_sentence1': 997, 'min_sentence2_length': 11, 'average_sentence2_length': 150.99, 'max_sentence2_length': 450, 'unique_sentence2': 1000}, 'ind-rej': {'num_samples': 1000, 'number_of_characters': 291205, 'unique_pairs': 1000, 'min_sentence1_length': 9, 
'average_sentence1_length': 151.62, 'max_sentence1_length': 873, 'unique_sentence1': 998, 'min_sentence2_length': 8, 'average_sentence2_length': 139.58, 'max_sentence2_length': 784, 'unique_sentence2': 1000}, 'ind-sun': {'num_samples': 6600, 'number_of_characters': 1956344, 'unique_pairs': 6591, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 150.99, 'max_sentence2_length': 881, 'unique_sentence2': 6588}}}} | -| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Reviews, Web, Social, Constructed, Written] | None | None | +| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Constructed, Reviews, Social, Web, Written] | None | None | | [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None | +| [OKVQAIT2TRetrieval](https://okvqa.allenai.org/) (Marino et al., 2019) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None | | [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -733,7 +510,8 @@ The following 
tables give you an overview of the tasks in MTEB. | [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | ->>>>>>> main +| [OVENIT2ITRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None | +| [OVENIT2TRetrieval](https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html) (Hu et al., 2023) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None | | [Ocnli](https://arxiv.org/abs/2010.05444) (Hai Hu, 2020) | ['cmn'] | PairClassification | s2s | | None | None | | [OdiaNewsClassification](https://github.com/goru001/nlp-for-odia) (Anoop Kunchukuttan, 2020) | ['ory'] | Classification | s2s | [News, Written] | None | None | | [OnlineShopping](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | @@ -741,82 +519,71 @@ The following tables give you an overview of the tasks in MTEB. 
| [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) (Mathias Creutz, 2018) | ['deu', 'eng', 'fin', 'fra', 'rus', 'swe'] | PairClassification | s2s | [Spoken, Spoken] | None | None | | [OralArgumentQuestionPurposeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OverrulingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OxfordFlowersClassification](https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train) | ['eng'] | ImageClassification | i2i | [Reviews] | None | None | +| [OxfordPets](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [OxfordPetsZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [PAC](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None | | [PAWSX](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [PIQA](https://arxiv.org/abs/1911.11641) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [PROALegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | ['pol'] | PairClassification | s2s | [News, Written] | None | None | +| [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None | +| [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | 
None | None | +| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ImageClassification | i2i | [Medical] | None | None | +| [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ZeroShotClassification | i2t | [Medical] | None | None | | [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Web, Encyclopaedic, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 
'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': 
{'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 
'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | +| [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 
2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 
'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': 
{'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | | [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None | +| [PersianTextEmotion](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | +| [PersianTextTone](https://mcinext.com/) | ['fas'] | Classification | s2p | | None | None | +| [PersianWebDocumentRetrieval](https://ieeexplore.ieee.org/document/10553090) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [PersonalJurisdictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PhincBitextMining](https://huggingface.co/datasets/veezbo/phinc) (Srivastava et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | {'train': 13738} | {'train': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 
13736, 'hf_subset_descriptive_stats': {'eng-eng_hin': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 13736}}}} | | [PlscClusteringP2P.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PlscClusteringS2S.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PoemSentimentClassification](https://arxiv.org/abs/2011.02686) (Emily Sheng, 2020) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | -| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | -| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | -| [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, Non-fiction, Web, Written, Spoken, Social, News] | None | None | -| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Medical, Government, Web, Written] | None | None | +| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, News, Non-fiction, Social, Spoken, Web, Written] | None | None | +| [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | 
['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | BitextMining | s2s | [Chemistry] | None | None | +| [PubChemSMILESPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiParagraphsPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | +| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | -<<<<<<< HEAD -| [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 2720} | {'test': {'average_document_length': 27.50788422240522, 'average_query_length': 1957.3632352941177, 'num_documents': 32787, 'num_queries': 2720, 'average_relevant_docs_per_query': 1.0}} | -| [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | {'validation': {'average_document_length': 65.82473022253414, 'average_query_length': 54.6006, 
'num_documents': 522931, 'num_queries': 5000, 'average_relevant_docs_per_query': 1.5252}, 'test': {'average_document_length': 65.82473022253414, 'average_query_length': 54.5354, 'num_documents': 522931, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.5675}} | -| [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | {'test': 1000} | {'test': {'average_document_length': 67.77529631287385, 'average_query_length': 53.846, 'num_documents': 172031, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.641}} | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | {'dev': {'average_document_length': 62.158154708747425, 'average_query_length': 51.5342, 'num_documents': 522931, 'num_queries': 5000, 'average_relevant_docs_per_query': 1.5252}, 'test': {'average_document_length': 62.158154708747425, 'average_query_length': 51.5396, 'num_documents': 522931, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.5675}} | -| [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | {'test': 1000} | {'test': {'average_document_length': 58.96963812985781, 'average_query_length': 51.228, 'num_documents': 177163, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.641}} | -| [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | {'test': 1484} | {'test': {'average_document_length': 793.6813076734267, 'average_query_length': 375.7506738544474, 'num_documents': 301482, 'num_queries': 1484, 'average_relevant_docs_per_query': 1.0}} | -| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 6319} | {'test': 
{'average_document_length': 504.0197829347469, 'average_query_length': 210.30732710871973, 'num_documents': 389376, 'num_queries': 6319, 'average_relevant_docs_per_query': 1.0}} | -| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [News, Web, Encyclopaedic, Written] | {'test': 1923} | {'test': 124.79} | -| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | {'test': 1924} | {'test': 61.25} | -| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Social, Written] | {'test': 32768} | {'test': 64.7} | -| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Social, Written] | {'test': 18375} | {'test': 727.7} | -| [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 231.4} | -| [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 10000} | {'test': {'average_document_length': 1165.6429557148213, 'average_query_length': 62.4029, 'num_documents': 704344, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.0}} | -| [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 1000} | {'test': {'average_document_length': 1225.7253146619116, 'average_query_length': 62.338, 'num_documents': 191237, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 95088} | {'eng': 2471.0398058252426} | -| 
[RomaTalesBitextMining](https://idoc.pub/documents/idocpub-zpnxm9g35ylv) | ['hun', 'rom'] | BitextMining | s2s | [Fiction, Written] | {'test': 215} | {'test': 316.8046511627907} | -| [RomaniBibleClustering](https://romani.global.bible/info) | ['rom'] | Clustering | p2p | [Religious, Written] | {'test': 2048} | {'test': 132.2} | -| [RomanianReviewsSentiment](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 588.6} | -| [RomanianSentimentClassification](https://arxiv.org/abs/2009.08712) (Dumitrescu et al., 2020) | ['ron'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 67.6} | -| [RonSTS](https://openreview.net/forum?id=JH61CD7afTv) (Dumitrescu et al., 2021) | ['ron'] | STS | s2s | [News, Social, Web, Written] | {'test': 1379} | {'test': 60.5} | -| [RuBQReranking](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 1551} | {'test': 499.9} | -| [RuBQRetrieval](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 2845} | {'test': {'average_document_length': 448.94659134903037, 'average_query_length': 45.29609929078014, 'num_documents': 56826, 'num_queries': 1692, 'average_relevant_docs_per_query': 1.6814420803782506}} | -| [RuReviewsClassification](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | Classification | p2p | [Reviews, Written] | {'test': 2048} | {'test': 133.2} | -| [RuSTSBenchmarkSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['rus'] | STS | s2s | [News, Social, Web, Written] | {'test': 1264} | {'test': 54.2} | -| [RuSciBenchGRNTIClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | {'test': 2048} | {'test': 890.1} | -| 
[RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'average_text_length': 889.81396484375, 'average_labels_per_text': 1.0, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | -| [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | {'test': 2048} | {'test': 838.9} | -| [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': 838.9} | -| [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3520} | -| [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3507} | -| [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3507} | -| [SCDBPTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} 
| {'test': 3506} | -| [SCDBPVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3498} | -| [SCDDAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3522} | -| [SCDDAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3506} | -| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3518} | -| [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3499} | -| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3503} | -| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | {'test': {'average_document_length': 1203.3659819932182, 'average_query_length': 71.632, 'num_documents': 25657, 'num_queries': 1000, 'average_relevant_docs_per_query': 4.928}} | -| [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1270.0791986592353, 'average_query_length': 80.671, 'num_documents': 25657, 'num_queries': 1000, 'average_relevant_docs_per_query': 4.928}} | -| [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 
'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | {'train': 701, 'validation': 99, 'test': 204} | {'train': 111.24, 'validation': 97.11, 'test': 135.53} | -| [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 
'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | {'test': 1004} | {'test': 114.78} | -| [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | {'test': 1000} | {'test': 54.89} | -| [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | STS | s2s | [Web, Written] | {'test': 1000} | {'test': 54.89} | -======= | [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [Query2Query](https://mcinext.com/) | ['fas'] | STS | s2s | | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | 
None | +| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Blog, Web, Written] | None | None | +| [QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | | [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | | [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [News, Web, Encyclopaedic, Written] | None | None | +| [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [RESISC45ZeroShot](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [ROxfordEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [ROxfordHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| 
[ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [ROxfordMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [RP2kI2IRetrieval](https://arxiv.org/abs/2006.12634) (Peng et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [RParisEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [RParisHardI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [RParisMediumI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | +| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) 
(Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | +| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None | -| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Social, Written] | None | None | -| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Social, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 
'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': 
{'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 
'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 
'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': 
{'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 
716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 
'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | +| [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | None | None | +| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None | +| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': 
{'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 
10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': 
{'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 
46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': 
{'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 
716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 
'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | +| [RenderedSST2](https://huggingface.co/datasets/clip-benchmark/wds_renderedsst2) | ['eng'] | ZeroShotClassification | i2t | [Reviews] | None | None | | [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | | [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | @@ -834,6 +601,7 @@ The following tables give you an overview of the tasks in MTEB. | [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'min_text_length': 84, 'average_text_length': 889.81, 'max_text_length': 3143, 'min_labels_per_text': 73, 'average_labels_per_text': 1.0, 'max_labels_per_text': 74, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | | 
[RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | | [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | +| [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | BitextMining | s2p | [Spoken] | None | None | | [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -844,87 +612,73 @@ The following tables give you an overview of the tasks in MTEB. 
| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | +| [SCIDOCS-Fa](https://huggingface.co/datasets/MCINext/scidocs-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | +| [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 
'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | | [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 
'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | | [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | | [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | STS | s2s | [Web, Written] | None | None | ->>>>>>> main | [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | PairClassification | s2s | | None | None | | [SICK-R](https://aclanthology.org/L14-1314/) | ['eng'] | STS | s2s | [Web, Written] | None | None | | [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [SICKFr](https://huggingface.co/datasets/Lajavaness/SICK-fr) | ['fra'] | STS | s2s | | None | None | -<<<<<<< HEAD -| [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 0} | {'test': {'average_document_length': 22.967085695044617, 'average_query_length': 127.75383828045035, 'num_documents': 71276, 'num_queries': 1954, 'average_relevant_docs_per_query': 1.0}} | -| [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | {'test': 1134} | {'test': {'average_document_length': 1180.5071792496526, 'average_query_length': 53.63403880070547, 'num_documents': 6477, 'num_queries': 1134, 'average_relevant_docs_per_query': 11}} | -| [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, 
Written] | {'test': 1300} | {'test': 1986.9453846153847} | -| [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | {'test': 1300} | {'test': 242.22384615384615} | -| [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | {'test': 2048} | {'test': {'average_document_length': 1986.9453846153847, 'average_query_length': 14.906153846153845, 'num_documents': 1300, 'num_queries': 1300, 'average_relevant_docs_per_query': 1.0}} | -| [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | {'test': 256} | {'test': 55} | -| [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 6216} | {'test': {'num_samples': 3108, 'average_sentence1_len': 63.78893178893179, 'average_sentence2_len': 65.5926640926641, 'avg_score': 3.5060643500643507}} | -| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [Web, News, Non-fiction, Written] | {'test': 3000} | {'test': 54.0} | -| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | {'test': 7500} | {'test': 54.3} | -| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Web, Written, Spoken] | {'test': 6000} | {'test': 57.7} | -| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | {'test': 2372} | {'test': 65.3} | -| [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 500} | {'test': {'num_samples': 5346, 'average_sentence1_len': 38.14665170220726, 
'average_sentence2_len': 36.72502805836139, 'avg_score': 2.3554804214989464, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'average_sentence1_len': 31.991918482080113, 'average_sentence2_len': 32.44483485593816, 'avg_score': 2.469359920356055}, 'ar-ar': {'num_samples': 250, 'average_sentence1_len': 32.208, 'average_sentence2_len': 32.78, 'avg_score': 2.216800000000001}, 'en-ar': {'num_samples': 250, 'average_sentence1_len': 42.36, 'average_sentence2_len': 32.696, 'avg_score': 2.1423999999999994}, 'en-de': {'num_samples': 250, 'average_sentence1_len': 43.952, 'average_sentence2_len': 44.756, 'avg_score': 2.2776000000000014}, 'en-en': {'num_samples': 250, 'average_sentence1_len': 43.952, 'average_sentence2_len': 42.724, 'avg_score': 2.2776000000000014}, 'en-tr': {'num_samples': 250, 'average_sentence1_len': 41.916, 'average_sentence2_len': 41.6, 'avg_score': 2.1335999999999986}, 'es-en': {'num_samples': 250, 'average_sentence1_len': 50.84, 'average_sentence2_len': 42.024, 'avg_score': 2.1464000000000003}, 'es-es': {'num_samples': 250, 'average_sentence1_len': 49.836, 'average_sentence2_len': 51.224, 'avg_score': 2.2312000000000007}, 'fr-en': {'num_samples': 250, 'average_sentence1_len': 49.624, 'average_sentence2_len': 42.724, 'avg_score': 2.2776000000000014}, 'it-en': {'num_samples': 250, 'average_sentence1_len': 50.028, 'average_sentence2_len': 42.724, 'avg_score': 2.2776000000000014}, 'nl-en': {'num_samples': 250, 'average_sentence1_len': 46.816, 'average_sentence2_len': 42.724, 'avg_score': 2.2776000000000014}}}} | -| [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | {'test': 3958} | {'test': 1993.6} | -======= +| [SIDClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Academic] | None | None | +| [SIDClustring](https://www.sid.com/) | ['fas'] | Clustering | p2p | [Academic] | None | None | | 
[SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | None | None | | [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | | [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [SOPI2IRetrieval](https://paperswithcode.com/dataset/stanford-online-products) (Oh Song et al., 2016) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | None | None | +| [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [STL10ZeroShot](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'number_of_characters': 402118, 'min_sentence1_length': 3, 'average_sentence1_len': 63.79, 'max_sentence1_length': 220, 'unique_sentence1': 2236, 'min_sentence2_length': 7, 'average_sentence2_len': 65.59, 'max_sentence2_length': 204, 'unique_sentence2': 2797, 'min_score': 0.0, 
'avg_score': 3.51, 'max_score': 5.0}} | -| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [Web, News, Non-fiction, Written] | None | None | -| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | None | None | -| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Web, Written, Spoken] | None | None | -| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | None | None | +| [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Encyclopaedic, News, Written] | None | None | +| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [News, Non-fiction, Web, Written] | None | None | +| [STS13VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [News, Non-fiction, Web, Written] | None | None | +| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | +| [STS14VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None | +| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None | +| [STS15VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, News, Spoken, Web, Written] | None | None | +| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | +| [STS16VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None | | [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 
5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 
'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 
'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | +| [STS17MultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None | | [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | ->>>>>>> main | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | | None | None | -| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Web, Spoken, Written] | None | None | +| [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | +| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | +| [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | -<<<<<<< HEAD -| [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] 
| {'test': 22979} | {'test': 500.0} | -| [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | {'train': 383, 'validation': 96} | {'train': 98.415, 'validation': 96.635} | -| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Fiction, News, Non-fiction, Blog, Spoken, Web, Written] | {'test': 4096} | {'test': 102.72} | -| [SciDocsRR](https://allenai.org/data/scidocs) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | {'test': 19599} | {'test': 69.0} | -| [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 1498.4152035500674, 'average_query_length': 88.58838071693448, 'num_documents': 5183, 'num_queries': 809, 'average_relevant_docs_per_query': 1.1359703337453646}, 'test': {'average_document_length': 1498.4152035500674, 'average_query_length': 90.34666666666666, 'num_documents': 5183, 'num_queries': 300, 'average_relevant_docs_per_query': 1.13}} | -| [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1553.5178468068686, 'average_query_length': 95.44, 'num_documents': 5183, 'num_queries': 300, 'average_relevant_docs_per_query': 1.13}} | -| [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | {'dev': 2089, 'test': 7498} | {'dev': 163.1, 'test': 145.9} | -| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Written] | {'test': 2048} | {'test': 95.3} | -| [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | 
Classification | s2s | [Reviews, Written] | {'train': 2497} | {'train': 81.29} | -| [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | {'train': 3327} | {'train': 148.04} | -| [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | {'train': 24094} | {'train': 56.08} | -| [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | {'train': 80} | {'train': 354.2} | -| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319} | {'test': 92.71} | -| [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 366.17} | -| [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | {'test': 600} | {'test': {'average_document_length': 2156.445, 'average_query_length': 143.59833333333333, 'num_documents': 600, 'num_queries': 600, 'average_relevant_docs_per_query': 1.0}} | -| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Web, Non-fiction, Written] | {'test': 2048} | {'test': 247.49} | -| [SpanishNewsClassification](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | {'train': 2048} | {'train': 4218.2} | -======= +| 
[SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | | [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | None | None | -| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Fiction, News, Non-fiction, Blog, Spoken, Web, Written] | None | None | +| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None | | [SciDocsRR](https://allenai.org/data/scidocs) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [SciMMIR](https://huggingface.co/datasets/m-a-p/SciMMIR) (Siwei Wu, 2024) | ['eng'] | ZeroShotClassification | i2t | [Academic] | None | None | +| [SciMMIRI2TRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2t | [Academic] | None | None | +| [SciMMIRT2IRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | 
Any2AnyRetrieval | t2i | [Academic] | None | None | | [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None | -| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Written] | None | None | +| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None | | [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | +| [SentimentDKSF](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | | [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | None | None | | [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None | +| [SketchyI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319, 'train': 11870} | {'test': {'num_samples': 1319, 'number_of_characters': 122279, 'num_texts_in_train': 46, 'min_text_length': 8, 'average_text_length': 92.71, 'max_text_length': 
1584, 'unique_text': 1315, 'unique_labels': 2, 'labels': {'1': {'count': 360}, '0': {'count': 959}}}, 'train': {'num_samples': 11870, 'number_of_characters': 1130860, 'num_texts_in_train': None, 'min_text_length': 7, 'average_text_length': 95.27, 'max_text_length': 2112, 'unique_text': 11655, 'unique_labels': 2, 'labels': {'1': {'count': 3245}, '0': {'count': 8625}}}} | | [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | | [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None | -| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Web, Non-fiction, Written] | None | None | +| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [SpanishNewsClassification](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None | ->>>>>>> main | [SpanishNewsClusteringP2P](https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification) | ['spa'] | Clustering | p2p | | None | None | | [SpanishPassageRetrievalS2P](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) | ['spa'] | Retrieval | s2p | | None | None | | [SpanishPassageRetrievalS2S](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) | ['spa'] | Retrieval | s2s | | None | None | @@ -933,9 +687,13 @@ The following tables give you an overview of the tasks in MTEB. 
| [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | | [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | | [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | -| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | | None | None | +| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | +| [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [StanfordCarsI2IRetrieval](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | +| [StanfordCarsZeroShot](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | 
ZeroShotClassification | i2t | [Scene] | None | None | | [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | +| [SugarCrepe](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | ImageTextPairClassification | i2t | [Encyclopaedic] | None | None | | [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | Summarization | p2p | [News, Written] | None | None | | [SummEvalSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['eng'] | Summarization | p2p | [News, Written] | None | None | | [SwahiliNewsClassification](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) | ['swa'] | Classification | s2s | [News, Written] | None | None | @@ -946,6 +704,33 @@ The following tables give you an overview of the tasks in MTEB. 
| [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | | [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | | [SwissJudgementClassification](https://aclanthology.org/2021.nllp-1.3/) (Joel Niklaus, 2022) | ['deu', 'fra', 'ita'] | Classification | s2s | [Legal, Written] | None | None | +| [SynPerChatbotConvSAAnger](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| SynPerChatbotConvSAClassification | ['fas'] | Classification | None | [Spoken] | None | None | +| [SynPerChatbotConvSAFear](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAFriendship](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAHappiness](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAJealousy](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSALove](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASadness](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASatisfaction](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASurprise](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGFAQPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Spoken] | 
None | None | +| [SynPerChatbotRAGFAQRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerChatbotRAGSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval) | ['fas'] | BitextMining | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerChatbotSatisfactionLevelClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval) | ['fas'] | BitextMining | p2p | [Spoken] | None | None | +| [SynPerChatbotToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerQAPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Blog, News, Religious, Web] | None | None | +| [SynPerQARetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/settings) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [SynPerSTS](https://mcinext.com/) | ['fas'] | STS | s2s | [Blog, News, Religious, Web] | None | None | +| [SynPerTextKeywordsPC](https://mcinext.com/) | ['fas'] | PairClassification | 
s2p | [Blog, News, Religious, Web] | None | None | | [SyntecReranking](https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Reranking | s2p | [Legal, Written] | None | None | | [SyntecRetrieval](https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Retrieval | s2p | [Legal, Written] | None | None | | [SyntheticText2SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql) (Meyer et al., 2024) | ['eng', 'sql'] | Retrieval | p2p | [Programming, Written] | {'test': 111702} | {'test': {'number_of_characters': 14041553, 'num_samples': 111702, 'num_queries': 5851, 'num_documents': 105851, 'min_document_length': 13, 'average_document_length': 4.58, 'max_document_length': 281, 'unique_documents': 105851, 'min_query_length': 17, 'average_query_length': 2316.95, 'max_query_length': 762, 'unique_queries': 5851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5851}} | @@ -953,63 +738,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [T2Retrieval](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Retrieval | s2p | | None | None | | [TERRa](https://arxiv.org/pdf/2010.15925) (Shavrina et al., 2020) | ['rus'] | PairClassification | s2s | [News, Web, Written] | None | None | | [TNews](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | -<<<<<<< HEAD -| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1116.7434221277986, 'average_query_length': 69.24, 'num_documents': 171332, 'num_queries': 50, 'average_relevant_docs_per_query': 493.5}} | -| [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | {'test': {'average_document_length': 1159.8020276422385, 'average_query_length': 69.42, 'num_documents': 171332, 'num_queries': 50, 'average_relevant_docs_per_query': 493.5}} | -| [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | {'test': 4096} | {'test': {'average_document_length': 1440.66552734375, 'average_query_length': 126.552734375, 'num_documents': 2048, 'num_queries': 2048, 'average_relevant_docs_per_query': 1.0}} | -| [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | {'train': 14521, 'test': 3631} | {'train': 56.5, 'test': 56.52} | -| [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) (Tatoeba community, 2021) | ['afr', 'amh', 'ang', 'ara', 'arq', 'arz', 'ast', 'awa', 'aze', 'bel', 'ben', 'ber', 'bos', 'bre', 'bul', 'cat', 'cbk', 'ceb', 'ces', 'cha', 'cmn', 'cor', 'csb', 'cym', 'dan', 'deu', 'dsb', 'dtp', 'ell', 'eng', 'epo', 'est', 'eus', 'fao', 'fin', 'fra', 'fry', 'gla', 'gle', 'glg', 
'gsw', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ido', 'ile', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kat', 'kaz', 'khm', 'kor', 'kur', 'kzj', 'lat', 'lfn', 'lit', 'lvs', 'mal', 'mar', 'max', 'mhr', 'mkd', 'mon', 'nds', 'nld', 'nno', 'nob', 'nov', 'oci', 'orv', 'pam', 'pes', 'pms', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'swg', 'swh', 'tam', 'tat', 'tel', 'tgl', 'tha', 'tuk', 'tur', 'tzl', 'uig', 'ukr', 'urd', 'uzb', 'vie', 'war', 'wuu', 'xho', 'yid', 'yue', 'zsm'] | BitextMining | s2s | [Written] | {'test': 2000} | {'test': 39.4} | -| [TbilisiCityHallBitextMining](https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles) | ['eng', 'kat'] | BitextMining | s2s | [News, Written] | {'test': 1820} | {'test': 78} | -| [TelemarketingSalesRuleLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 47} | {'test': 348.29} | -| [TeluguAndhraJyotiNewsClassification](https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset) | ['tel'] | Classification | s2s | [News, Written] | {'test': 4329} | {'test': 1428.28} | -| [TempReasonL1](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 4000} | {'test': {'average_document_length': 8.989843250159948, 'average_query_length': 50.22375, 'num_documents': 12504, 'num_queries': 4000, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL2Context](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 0} | {'test': {'average_document_length': 19.823525685690758, 'average_query_length': 11919.25792106726, 'num_documents': 15787, 'num_queries': 5397, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL2Fact](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, 
Written] | {'test': 5397} | {'test': {'average_document_length': 19.823525685690758, 'average_query_length': 830.7268853066519, 'num_documents': 15787, 'num_queries': 5397, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL2Pure](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 5397} | {'test': {'average_document_length': 19.823525685690758, 'average_query_length': 55.94089308875301, 'num_documents': 15787, 'num_queries': 5397, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL3Context](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 4426} | {'test': {'average_document_length': 19.80534984678243, 'average_query_length': 13424.633077270673, 'num_documents': 15664, 'num_queries': 4426, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL3Fact](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 4426} | {'test': {'average_document_length': 19.80534984678243, 'average_query_length': 896.0754631721645, 'num_documents': 15664, 'num_queries': 4426, 'average_relevant_docs_per_query': 1.0}} | -| [TempReasonL3Pure](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 4426} | {'test': {'average_document_length': 19.80534984678243, 'average_query_length': 74.44012652507908, 'num_documents': 15664, 'num_queries': 4426, 'average_relevant_docs_per_query': 1.0}} | -| [TenKGnadClassification](https://tblock.github.io/10kGNAD/) | ['deu'] | Classification | p2p | [News, Written] | {'test': 1028} | {'test': 2627.31} | -| [TenKGnadClusteringP2P.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | Clustering | p2p | [News, Non-fiction, Written] | {'test': 10275} | {'test': 2641.03} | -| [TenKGnadClusteringS2S.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | 
Clustering | s2s | [News, Non-fiction, Written] | {'test': 10267} | {'test': 50.96} | -| [TextualismToolDictionariesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 107} | {'test': 943.23} | -| [TextualismToolPlainLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 165} | {'test': 997.97} | -| [ThuNewsClusteringP2P.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | p2p | [News, Written] | {'test': 2048} | {} | -| [ThuNewsClusteringS2S.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | s2s | [News, Written] | {'test': 2048} | {} | -| [TopiOCQA](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'dev': 2514} | {'validation': {'average_document_length': 478.8968086416064, 'average_query_length': 12.579952267303103, 'num_documents': 25700592, 'num_queries': 2514, 'average_relevant_docs_per_query': 1.0}} | -| [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 1000} | {'validation': {'average_document_length': 538.7586536643946, 'average_query_length': 12.85, 'num_documents': 89933, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) | ['eng'] | Retrieval | s2p | [Academic] | | | -| [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | {'test': 1427} | {'test': 189.4} | -| [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, 
Written] | {'test': 50000} | {'test': 296.6} | -| [TswanaNewsClassification](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | {'validation': 487, 'test': 487} | {'validation': 2417.72, 'test': 2369.52} | -| [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Academic, Written] | {'test': 1330} | {'test': {'average_document_length': 172.12118713932398, 'average_query_length': 62.5302734375, 'num_documents': 1213, 'num_queries': 1024, 'average_relevant_docs_per_query': 2.0}} | -| [TurkicClassification](https://huggingface.co/datasets/Electrotubbie/classification_Turkic_languages/) | ['bak', 'kaz', 'kir'] | Classification | s2s | [News, Written] | {'train': 193056} | {'train': 1103.13} | -| [TurkishMovieSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | {'test': 2644} | {'test': 141.5} | -| [TurkishProductSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | {'test': 800} | {'test': 246.85} | -| [TweetEmotionClassification](https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8) (Al-Khatib et al., 2018) | ['ara'] | Classification | s2s | [Social, Written] | {'train': 2048} | {'train': 78.8} | -| [TweetSarcasmClassification](https://aclanthology.org/2020.osact-1.5/) | ['ara'] | Classification | s2s | [Social, Written] | {'test': 2110} | {'test': 102.1} | -| [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | {'test': 2048} | {'test': 83.51} | -| 
[TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | {'test': 3534} | {'test': 67.8} | -| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [Social, News, Written] | {'test_2021': 1693} | {'test_2021': 167.66} | -| [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 2381} | {'test': 32.0} | -| [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | {'train': 340} | {'train': {'average_document_length': 128.85114503816794, 'average_query_length': 166.3846153846154, 'num_documents': 262, 'num_queries': 78, 'average_relevant_docs_per_query': 3.358974358974359}} | -| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | | {'test': 16777} | {'test': 38.3} | -| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | | {'test': 51534} | {'test': {'num_samples': 51534, 'avg_sentence1_len': 79.48919160166103, 'avg_sentence2_len': 88.5540419916948, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | -| [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 94} | {'test': 114.127} | -| [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | {'train': 2048, 'test': 2048} | {'train': 52.1, 'test': 53.07} | -| [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | 
s2s | [Legal, Written] | {'test': 2048} | {'test': 184.69} | -| [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | {'train': 2048} | {'train': 68.248} | -| [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | {'test': 2048} | {'test': 2670.3243084794544} | -| [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | {'test': 2048} | {'test': 139.31247668283325} | -| [VideoRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 31.048855642524522, 'average_query_length': 7.365, 'num_documents': 100930, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [VieMedEVBitextMining](https://aclanthology.org/2015.iwslt-evaluation.11/) (Nhu Vo, 2024) | ['eng', 'vie'] | BitextMining | s2s | [Medical, Written] | {'test': 2048} | {'test': 139.23} | -| [VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'validation': 2048} | {'validation': {'average_document_length': 222.61244979919678, 'average_query_length': 65.51513671875, 'num_documents': 2490, 'num_queries': 2048, 'average_relevant_docs_per_query': 2.0}} | -| [VieStudentFeedbackClassification](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 14.22} | -| [VoyageMMarcoReranking](https://arxiv.org/abs/2312.16144) (Benjamin Claviรฉ, 2023) | ['jpn'] | Reranking | s2s | [Academic, Non-fiction, Written] | {'test': 2048} | {'test': 162} | -| [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) | 
['jpn'] | Classification | s2s | [Social, Written] | {'test': 2048} | {'test': 47.78} | -======= -| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None | +| [TUBerlinT2IRetrieval](https://dl.acm.org/doi/pdf/10.1145/2185520.2185540?casa_token=tq-eUx5UROYAAAAA:_694nPzE7tali6LCkxQc0M-mlo9xslasPMcVnFPMy9tDfvt7lg7p1RTe-k8VWCjuv9gmkQqasKUZ) (Eitz et al., 2012) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | | [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | | [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None | | [Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) (Tatoeba community, 2021) | ['afr', 'amh', 'ang', 'ara', 'arq', 'arz', 'ast', 'awa', 'aze', 'bel', 'ben', 'ber', 'bos', 'bre', 'bul', 'cat', 'cbk', 'ceb', 'ces', 'cha', 'cmn', 'cor', 'csb', 'cym', 'dan', 'deu', 'dsb', 'dtp', 'ell', 'eng', 'epo', 'est', 'eus', 'fao', 'fin', 'fra', 'fry', 'gla', 'gle', 'glg', 'gsw', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ido', 'ile', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kat', 'kaz', 'khm', 'kor', 'kur', 'kzj', 'lat', 'lfn', 'lit', 'lvs', 'mal', 'mar', 'max', 'mhr', 'mkd', 'mon', 'nds', 'nld', 'nno', 'nob', 'nov', 'oci', 'orv', 
'pam', 'pes', 'pms', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'swg', 'swh', 'tam', 'tat', 'tel', 'tgl', 'tha', 'tuk', 'tur', 'tzl', 'uig', 'ukr', 'urd', 'uzb', 'vie', 'war', 'wuu', 'xho', 'yid', 'yue', 'zsm'] | BitextMining | s2s | [Written] | None | None | @@ -1030,13 +762,15 @@ The following tables give you an overview of the tasks in MTEB. | [TextualismToolPlainLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [ThuNewsClusteringP2P.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | p2p | [News, Written] | None | None | | [ThuNewsClusteringS2S.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | s2s | [News, Written] | None | None | +| [TinyImageNetClustering](https://huggingface.co/datasets/zh-plus/tiny-imagenet/viewer/default/valid) | ['eng'] | ImageClustering | i2i | [Reviews] | None | None | | [TopiOCQA](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | | [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) | ['eng'] | Retrieval | s2p | [Academic] | {'test': 303781} | {'test': {'number_of_characters': 637047138, 'num_samples': 303781, 'num_queries': 49, 'num_documents': 303732, 'min_document_length': 16, 'average_document_length': 0.01, 'max_document_length': 83, 'unique_documents': 303732, 'min_query_length': 41, 'average_query_length': 13000918.57, 'max_query_length': 105983, 'unique_queries': 49, 'min_relevant_docs_per_query': 40, 'average_relevant_docs_per_query': 58.14, 
'max_relevant_docs_per_query': 87, 'unique_relevant_docs': 2732}} | | [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | | [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [TswanaNewsClassification](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | None | None | -| [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Academic, Written] | None | None | +| [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Academic, Encyclopaedic, Non-fiction, Written] | None | None | | [TurkicClassification](https://huggingface.co/datasets/Electrotubbie/classification_Turkic_languages/) | ['bak', 'kaz', 'kir'] | Classification | s2s | [News, Written] | None | None | | [TurkishMovieSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | | [TurkishProductSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | @@ -1044,64 +778,83 @@ The following tables give you an overview of the tasks in MTEB. 
| [TweetSarcasmClassification](https://aclanthology.org/2020.osact-1.5/) | ['ara'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | None | None | -| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [Social, News, Written] | None | None | +| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | | [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 59545} | {'test': {'num_samples': 59545, 'number_of_characters': 1907719, 'min_text_length': 11, 'average_text_length': 32.04, 'max_text_length': 120, 'min_labels_per_text': 2082, 'average_labels_per_text': 1.0, 'max_labels_per_text': 3236, 'unique_labels': 20, 'labels': {'12': {'count': 3137}, '6': {'count': 3070}, '0': {'count': 2613}, '2': {'count': 3155}, '10': {'count': 3220}, '17': {'count': 2986}, '14': {'count': 3106}, '13': {'count': 3055}, '1': {'count': 3056}, '16': {'count': 2911}, '9': {'count': 2984}, '3': {'count': 3070}, '15': {'count': 3090}, '7': {'count': 3036}, '5': {'count': 3124}, '11': {'count': 3236}, '18': {'count': 2483}, '8': {'count': 3090}, '19': {'count': 2082}, '4': {'count': 3041}}}} | | [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | None | None | -| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | 
PairClassification | s2s | | None | None | -| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | +| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | +| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | | [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| [UCF101ZeroShot](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | 
None | None | | [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | | [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | +| [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageMultilabelClassification | i2i | [Encyclopaedic] | None | None | +| [VQA2IT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html) (Goyal et al., 2017) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | None | None | | [VideoRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [VidoreArxivQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreDocVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreInfoVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreShiftProjectRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreSyntheticDocQAAIRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreSyntheticDocQAEnergyRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| 
[VidoreSyntheticDocQAGovernmentReportsRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreSyntheticDocQAHealthcareIndustryRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreTabfquadRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | +| [VidoreTatdqaRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | None | None | | [VieMedEVBitextMining](https://aclanthology.org/2015.iwslt-evaluation.11/) (Nhu Vo, 2024) | ['eng', 'vie'] | BitextMining | s2s | [Medical, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 575910, 'unique_pairs': 2048, 'min_sentence1_length': 11, 'average_sentence1_length': 139.23, 'max_sentence1_length': 1291, 'unique_sentence1': 2048, 'min_sentence2_length': 11, 'average_sentence2_length': 141.98, 'max_sentence2_length': 1217, 'unique_sentence2': 2047}} | | [VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [VieStudentFeedbackClassification](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | +| [VisualNewsI2TRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | +| [VisualNewsT2IRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | +| [VizWizIT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/papers/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.pdf) (Gurari et al., 2018) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | None | 
None | | [VoyageMMarcoReranking](https://arxiv.org/abs/2312.16144) (Benjamin Claviรฉ, 2023) | ['jpn'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | +| [WITT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['ara', 'bul', 'dan', 'ell', 'eng', 'est', 'ind', 'jpn', 'kor', 'tur', 'vie'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None | | [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) | ['jpn'] | Classification | s2s | [Social, Written] | None | None | ->>>>>>> main | [Waimai](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lรน, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None | +| [WebQAT2ITRetrieval](https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html) (Chang et al., 2022) | ['eng'] | Any2AnyRetrieval | t2it | [Encyclopaedic] | None | None | +| [WebQAT2TRetrieval](https://openaccess.thecvf.com/content/CVPR2022/html/Chang_WebQA_Multihop_and_Multimodal_QA_CVPR_2022_paper.html) (Chang et al., 2022) | ['eng'] | Any2AnyRetrieval | t2t | [Encyclopaedic] | None | None | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | -<<<<<<< HEAD -| [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | {'test': 2048} | {'test': {'num_samples': 28672, 'average_text_length': 629.7426409040179, 'average_labels_per_text': 1.0, 'unique_labels': 39, 'labels': {'16': {'count': 541}, '3': {'count': 1607}, '12': {'count': 846}, '0': {'count': 2410}, '15': {'count': 878}, '11': {'count': 864}, '6': 
{'count': 787}, '9': {'count': 654}, '14': {'count': 966}, '8': {'count': 1389}, '2': {'count': 2428}, '10': {'count': 839}, '1': {'count': 1370}, '4': {'count': 2942}, '7': {'count': 2514}, '5': {'count': 1490}, '13': {'count': 918}, '19': {'count': 315}, '17': {'count': 711}, '20': {'count': 345}, '18': {'count': 800}, '24': {'count': 467}, '25': {'count': 928}, '21': {'count': 62}, '26': {'count': 270}, '22': {'count': 186}, '23': {'count': 36}, '27': {'count': 465}, '28': {'count': 62}, '36': {'count': 139}, '32': {'count': 57}, '38': {'count': 43}, '30': {'count': 52}, '34': {'count': 80}, '33': {'count': 75}, '35': {'count': 62}, '31': {'count': 63}, '37': {'count': 8}, '29': {'count': 3}}, 'hf_subset_descriptive_stats': {'bs': {'num_samples': 2048, 'average_text_length': 1046.25732421875, 'average_labels_per_text': 1.0, 'unique_labels': 17, 'labels': {'16': {'count': 268}, '3': {'count': 89}, '12': {'count': 597}, '0': {'count': 202}, '15': {'count': 113}, '11': {'count': 11}, '6': {'count': 142}, '9': {'count': 181}, '14': {'count': 179}, '8': {'count': 33}, '2': {'count': 172}, '10': {'count': 12}, '1': {'count': 7}, '4': {'count': 25}, '7': {'count': 6}, '5': {'count': 9}, '13': {'count': 2}}}, 'ca': {'num_samples': 2048, 'average_text_length': 600.73291015625, 'average_labels_per_text': 1.0, 'unique_labels': 8, 'labels': {'6': {'count': 257}, '1': {'count': 737}, '2': {'count': 284}, '4': {'count': 394}, '0': {'count': 162}, '7': {'count': 151}, '5': {'count': 55}, '3': {'count': 8}}}, 'cs': {'num_samples': 2048, 'average_text_length': 659.2294921875, 'average_labels_per_text': 1.0, 'unique_labels': 21, 'labels': {'19': {'count': 35}, '5': {'count': 624}, '17': {'count': 126}, '10': {'count': 155}, '1': {'count': 231}, '7': {'count': 215}, '11': {'count': 128}, '0': {'count': 57}, '13': {'count': 75}, '2': {'count': 83}, '3': {'count': 38}, '9': {'count': 8}, '6': {'count': 14}, '12': {'count': 9}, '16': {'count': 16}, '20': {'count': 73}, '18': 
{'count': 38}, '4': {'count': 60}, '15': {'count': 14}, '14': {'count': 38}, '8': {'count': 11}}}, 'da': {'num_samples': 2048, 'average_text_length': 767.58935546875, 'average_labels_per_text': 1.0, 'unique_labels': 20, 'labels': {'14': {'count': 212}, '4': {'count': 74}, '15': {'count': 16}, '8': {'count': 165}, '13': {'count': 115}, '0': {'count': 79}, '1': {'count': 34}, '9': {'count': 114}, '7': {'count': 364}, '10': {'count': 32}, '17': {'count': 66}, '18': {'count': 32}, '12': {'count': 129}, '11': {'count': 159}, '2': {'count': 66}, '3': {'count': 185}, '19': {'count': 103}, '16': {'count': 33}, '5': {'count': 56}, '6': {'count': 14}}}, 'eu': {'num_samples': 2048, 'average_text_length': 405.16015625, 'average_labels_per_text': 1.0, 'unique_labels': 5, 'labels': {'4': {'count': 383}, '0': {'count': 995}, '3': {'count': 282}, '2': {'count': 344}, '1': {'count': 44}}}, 'gv': {'num_samples': 2048, 'average_text_length': 368.01123046875, 'average_labels_per_text': 1.0, 'unique_labels': 28, 'labels': {'6': {'count': 32}, '1': {'count': 83}, '24': {'count': 13}, '17': {'count': 152}, '2': {'count': 534}, '25': {'count': 76}, '5': {'count': 198}, '15': {'count': 100}, '21': {'count': 22}, '26': {'count': 188}, '13': {'count': 230}, '20': {'count': 11}, '3': {'count': 107}, '19': {'count': 88}, '16': {'count': 55}, '22': {'count': 29}, '14': {'count': 12}, '8': {'count': 61}, '0': {'count': 5}, '10': {'count': 4}, '4': {'count': 9}, '23': {'count': 6}, '7': {'count': 3}, '9': {'count': 20}, '18': {'count': 4}, '12': {'count': 3}, '27': {'count': 1}, '11': {'count': 2}}}, 'ilo': {'num_samples': 2048, 'average_text_length': 617.90771484375, 'average_labels_per_text': 1.0, 'unique_labels': 29, 'labels': {'3': {'count': 562}, '0': {'count': 373}, '18': {'count': 521}, '8': {'count': 129}, '13': {'count': 123}, '11': {'count': 54}, '25': {'count': 8}, '27': {'count': 5}, '17': {'count': 13}, '15': {'count': 4}, '4': {'count': 28}, '7': {'count': 83}, '10': {'count': 15}, 
'1': {'count': 11}, '24': {'count': 15}, '14': {'count': 8}, '16': {'count': 4}, '19': {'count': 9}, '23': {'count': 10}, '26': {'count': 4}, '28': {'count': 8}, '12': {'count': 29}, '21': {'count': 12}, '6': {'count': 5}, '20': {'count': 6}, '5': {'count': 4}, '22': {'count': 2}, '9': {'count': 2}, '2': {'count': 1}}}, 'ku': {'num_samples': 2048, 'average_text_length': 421.17333984375, 'average_labels_per_text': 1.0, 'unique_labels': 39, 'labels': {'14': {'count': 14}, '36': {'count': 139}, '20': {'count': 108}, '22': {'count': 27}, '15': {'count': 102}, '32': {'count': 55}, '8': {'count': 431}, '17': {'count': 210}, '38': {'count': 43}, '30': {'count': 51}, '4': {'count': 60}, '2': {'count': 111}, '6': {'count': 95}, '34': {'count': 70}, '27': {'count': 15}, '5': {'count': 174}, '26': {'count': 37}, '0': {'count': 11}, '25': {'count': 50}, '16': {'count': 2}, '12': {'count': 16}, '24': {'count': 2}, '11': {'count': 17}, '21': {'count': 9}, '13': {'count': 20}, '1': {'count': 7}, '33': {'count': 33}, '35': {'count': 28}, '10': {'count': 11}, '31': {'count': 51}, '18': {'count': 4}, '3': {'count': 4}, '28': {'count': 8}, '37': {'count': 8}, '23': {'count': 2}, '19': {'count': 7}, '7': {'count': 6}, '9': {'count': 8}, '29': {'count': 2}}}, 'lv': {'num_samples': 2048, 'average_text_length': 770.67138671875, 'average_labels_per_text': 1.0, 'unique_labels': 16, 'labels': {'15': {'count': 288}, '2': {'count': 110}, '6': {'count': 74}, '12': {'count': 50}, '0': {'count': 171}, '14': {'count': 188}, '10': {'count': 351}, '5': {'count': 142}, '4': {'count': 300}, '13': {'count': 60}, '11': {'count': 48}, '1': {'count': 165}, '8': {'count': 53}, '7': {'count': 5}, '3': {'count': 9}, '9': {'count': 34}}}, 'min': {'num_samples': 2048, 'average_text_length': 631.74072265625, 'average_labels_per_text': 1.0, 'unique_labels': 15, 'labels': {'7': {'count': 1595}, '9': {'count': 9}, '4': {'count': 48}, '3': {'count': 83}, '2': {'count': 160}, '0': {'count': 19}, '5': {'count': 74}, 
'6': {'count': 12}, '10': {'count': 12}, '13': {'count': 10}, '8': {'count': 5}, '11': {'count': 13}, '12': {'count': 2}, '1': {'count': 5}, '14': {'count': 1}}}, 'mt': {'num_samples': 2048, 'average_text_length': 821.22265625, 'average_labels_per_text': 1.0, 'unique_labels': 27, 'labels': {'12': {'count': 8}, '10': {'count': 147}, '14': {'count': 180}, '17': {'count': 117}, '25': {'count': 654}, '19': {'count': 35}, '0': {'count': 77}, '3': {'count': 12}, '16': {'count': 44}, '15': {'count': 108}, '24': {'count': 267}, '6': {'count': 43}, '26': {'count': 32}, '4': {'count': 79}, '22': {'count': 67}, '9': {'count': 16}, '8': {'count': 16}, '2': {'count': 55}, '5': {'count': 6}, '11': {'count': 30}, '18': {'count': 12}, '21': {'count': 12}, '20': {'count': 15}, '23': {'count': 7}, '13': {'count': 6}, '7': {'count': 1}, '1': {'count': 2}}}, 'sco': {'num_samples': 2048, 'average_text_length': 1065.21044921875, 'average_labels_per_text': 1.0, 'unique_labels': 23, 'labels': {'18': {'count': 178}, '6': {'count': 92}, '9': {'count': 28}, '15': {'count': 106}, '8': {'count': 432}, '2': {'count': 95}, '11': {'count': 104}, '1': {'count': 42}, '13': {'count': 248}, '16': {'count': 118}, '20': {'count': 130}, '3': {'count': 171}, '22': {'count': 57}, '7': {'count': 83}, '10': {'count': 74}, '5': {'count': 6}, '4': {'count': 17}, '17': {'count': 24}, '14': {'count': 14}, '0': {'count': 7}, '19': {'count': 18}, '21': {'count': 3}, '12': {'count': 1}}}, 'sq': {'num_samples': 2048, 'average_text_length': 425.486328125, 'average_labels_per_text': 1.0, 'unique_labels': 36, 'labels': {'27': {'count': 444}, '9': {'count': 234}, '14': {'count': 120}, '0': {'count': 128}, '15': {'count': 27}, '11': {'count': 298}, '24': {'count': 170}, '28': {'count': 46}, '19': {'count': 20}, '25': {'count': 140}, '3': {'count': 47}, '2': {'count': 87}, '35': {'count': 34}, '8': {'count': 53}, '31': {'count': 12}, '17': {'count': 3}, '23': {'count': 11}, '20': {'count': 2}, '33': {'count': 42}, '10': 
{'count': 26}, '34': {'count': 10}, '7': {'count': 2}, '13': {'count': 29}, '4': {'count': 4}, '6': {'count': 7}, '26': {'count': 9}, '5': {'count': 16}, '30': {'count': 1}, '21': {'count': 4}, '22': {'count': 4}, '18': {'count': 11}, '32': {'count': 2}, '12': {'count': 2}, '16': {'count': 1}, '1': {'count': 1}, '29': {'count': 1}}}, 'wa': {'num_samples': 2048, 'average_text_length': 216.00390625, 'average_labels_per_text': 1.0, 'unique_labels': 6, 'labels': {'5': {'count': 126}, '4': {'count': 1461}, '0': {'count': 124}, '2': {'count': 326}, '3': {'count': 10}, '1': {'count': 1}}}}}} | -| [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'en': 1500, 'de': 1500, 'it': 1500, 'pt': 1500, 'nl': 1500, 'cs': 1500, 'ro': 1500, 'bg': 1500, 'sr': 1500, 'fi': 1500, 'da': 1500, 'fa': 1500, 'hi': 1500, 'bn': 1500, 'no': 1500, 'sv': 1500} | {'test': {'num_samples': 24000, 'num_positive': 24000, 'num_negative': 24000, 'avg_query_len': 59.091208333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 60.82666666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'bn': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 47.266666666666666, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'cs': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 56.272, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'da': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 56.75066666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'de': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 70.004, 
'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'en': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 68.372, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'fa': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 48.66733333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'fi': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.343333333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'hi': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 50.77733333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'it': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 70.05466666666666, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'nl': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 65.34466666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'pt': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 65.11933333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'ro': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 61.973333333333336, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'sr': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.669333333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'no': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.288, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'sv': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 57.73, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}}}} | -| [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 
'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | {'en': 1500, 'de': 1500, 'it': 1500, 'pt': 1500, 'nl': 1500, 'cs': 1500, 'ro': 1500, 'bg': 1500, 'sr': 1500, 'fi': 1500, 'da': 1500, 'fa': 1500, 'hi': 1500, 'bn': 1500, 'no': 1500, 'sv': 1500} | {'test': {'bg': {'average_document_length': 374.376, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'bn': {'average_document_length': 394.05044444444445, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'cs': {'average_document_length': 369.9831111111111, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'da': {'average_document_length': 345.2597037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 398.4137777777778, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 452.9871111111111, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'fa': {'average_document_length': 345.1568888888889, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'fi': {'average_document_length': 379.71237037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 410.72540740740743, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'it': {'average_document_length': 393.73437037037036, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'nl': {'average_document_length': 375.6695555555556, 
'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'pt': {'average_document_length': 398.27237037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'ro': {'average_document_length': 348.3817037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'sr': {'average_document_length': 384.3131851851852, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'no': {'average_document_length': 366.93733333333336, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'sv': {'average_document_length': 369.340962962963, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}}} | -| [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 0} | {'test': {'average_document_length': 7.68243375858685, 'average_query_length': 111.78216258879242, 'num_documents': 5095, 'num_queries': 1267, 'average_relevant_docs_per_query': 1.0}} | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | {'train': 2048} | {'train': 103.42} | -| XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | {'test': {'de': {'average_document_length': 187.4061197288943, 'average_query_length': 15.717612088184294, 'num_documents': 70526, 'num_queries': 4037, 'average_relevant_docs_per_query': 54.3522417636859}, 'en': {'average_document_length': 452.792089662076, 'average_query_length': 15.881635344543357, 'num_documents': 218777, 'num_queries': 9099, 'average_relevant_docs_per_query': 85.43719090009891}, 'es': {'average_document_length': 
279.67909262759923, 'average_query_length': 19.97062937062937, 'num_documents': 39675, 'num_queries': 3575, 'average_relevant_docs_per_query': 36.01006993006993}}} | -| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'validation': 2163, 'test': 2460} | {'test': {'num_samples': 19110, 'avg_sentence1_len': 103.23793825222397, 'avg_sentence2_len': 48.88895866038723, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'avg_sentence1_len': 89.57362637362637, 'avg_sentence2_len': 41.99487179487179, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'avg_sentence1_len': 110.01611721611722, 'avg_sentence2_len': 51.62930402930403, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'avg_sentence1_len': 119.92600732600732, 'avg_sentence2_len': 56.794871794871796, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'avg_sentence1_len': 119.05421245421246, 'avg_sentence2_len': 56.93260073260073, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'avg_sentence1_len': 105.67032967032966, 'avg_sentence2_len': 49.8043956043956, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'avg_sentence1_len': 115.43296703296703, 'avg_sentence2_len': 54.68205128205128, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'avg_sentence1_len': 121.0967032967033, 'avg_sentence2_len': 58.58021978021978, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 
'avg_sentence1_len': 104.63443223443224, 'avg_sentence2_len': 50.17289377289377, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'avg_sentence1_len': 110.76923076923077, 'avg_sentence2_len': 52.452014652014654, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'avg_sentence1_len': 104.43956043956044, 'avg_sentence2_len': 49.48205128205128, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'avg_sentence1_len': 96.6923076923077, 'avg_sentence2_len': 44.544322344322346, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'avg_sentence1_len': 103.67765567765568, 'avg_sentence2_len': 49.18534798534799, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'avg_sentence1_len': 111.31208791208792, 'avg_sentence2_len': 52.46007326007326, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'avg_sentence1_len': 33.03589743589744, 'avg_sentence2_len': 15.73040293040293, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'avg_sentence1_len': 103.20790162218734, 'avg_sentence2_len': 49.01909994767138, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'avg_sentence1_len': 88.31868131868131, 'avg_sentence2_len': 41.61172161172161, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'avg_sentence1_len': 109.196336996337, 'avg_sentence2_len': 51.967032967032964, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'avg_sentence1_len': 119.81172161172161, 'avg_sentence2_len': 57.36923076923077, 'unique_labels': 2, 'labels': {'0': 
{'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'avg_sentence1_len': 119.87545787545787, 'avg_sentence2_len': 56.88278388278388, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'avg_sentence1_len': 105.71648351648352, 'avg_sentence2_len': 49.87619047619047, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'avg_sentence1_len': 115.17289377289377, 'avg_sentence2_len': 55.120879120879124, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'avg_sentence1_len': 121.75897435897436, 'avg_sentence2_len': 59.08864468864469, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'avg_sentence1_len': 105.06446886446886, 'avg_sentence2_len': 50.44395604395604, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'avg_sentence1_len': 109.74725274725274, 'avg_sentence2_len': 52.26886446886447, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'avg_sentence1_len': 104.32234432234432, 'avg_sentence2_len': 49.87692307692308, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'avg_sentence1_len': 97.28498168498169, 'avg_sentence2_len': 43.843223443223444, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'avg_sentence1_len': 102.96630036630036, 'avg_sentence2_len': 49.63809523809524, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'avg_sentence1_len': 112.26373626373626, 'avg_sentence2_len': 52.432967032967035, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'avg_sentence1_len': 33.41098901098901, 'avg_sentence2_len': 15.846886446886447, 
'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | -| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'test': 5010} | {'test': 80.06} | -| [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | {'test': 19801} | {'test': {'ara-ara': {'average_document_length': 61.88361204013378, 'average_query_length': 29.688, 'num_documents': 1495, 'num_queries': 750, 'average_relevant_docs_per_query': 2.004}, 'eng-ara': {'average_document_length': 125.26940639269407, 'average_query_length': 29.688, 'num_documents': 1533, 'num_queries': 750, 'average_relevant_docs_per_query': 2.058666666666667}, 'ara-eng': {'average_document_length': 61.88361204013378, 'average_query_length': 39.5188679245283, 'num_documents': 1495, 'num_queries': 742, 'average_relevant_docs_per_query': 2.024258760107817}, 'deu-deu': {'average_document_length': 69.54807692307692, 'average_query_length': 55.51827676240209, 'num_documents': 1248, 'num_queries': 766, 'average_relevant_docs_per_query': 1.6318537859007833}, 'eng-deu': {'average_document_length': 115.77118078719145, 'average_query_length': 55.51827676240209, 'num_documents': 1499, 'num_queries': 766, 'average_relevant_docs_per_query': 1.9634464751958225}, 'deu-eng': {'average_document_length': 69.54807692307692, 'average_query_length': 51.88903394255875, 'num_documents': 1248, 'num_queries': 766, 'average_relevant_docs_per_query': 1.6318537859007833}, 'spa-spa': {'average_document_length': 68.27511591962906, 'average_query_length': 46.711223203026485, 'num_documents': 1941, 'num_queries': 793, 'average_relevant_docs_per_query': 2.4489281210592684}, 'eng-spa': {'average_document_length': 
123.43698347107438, 'average_query_length': 46.711223203026485, 'num_documents': 1936, 'num_queries': 793, 'average_relevant_docs_per_query': 2.472887767969735}, 'spa-eng': {'average_document_length': 68.27511591962906, 'average_query_length': 47.21059268600252, 'num_documents': 1941, 'num_queries': 793, 'average_relevant_docs_per_query': 2.4489281210592684}, 'fra-fra': {'average_document_length': 76.99354005167959, 'average_query_length': 56.0520694259012, 'num_documents': 1548, 'num_queries': 749, 'average_relevant_docs_per_query': 2.069425901201602}, 'eng-fra': {'average_document_length': 137.31242532855435, 'average_query_length': 56.0520694259012, 'num_documents': 1674, 'num_queries': 749, 'average_relevant_docs_per_query': 2.248331108144192}, 'fra-eng': {'average_document_length': 76.99354005167959, 'average_query_length': 49.58744993324433, 'num_documents': 1548, 'num_queries': 749, 'average_relevant_docs_per_query': 2.069425901201602}, 'hin-hin': {'average_document_length': 47.20783373301359, 'average_query_length': 33.47783783783784, 'num_documents': 1251, 'num_queries': 925, 'average_relevant_docs_per_query': 1.3902702702702703}, 'eng-hin': {'average_document_length': 106.67662682602922, 'average_query_length': 33.47783783783784, 'num_documents': 1506, 'num_queries': 925, 'average_relevant_docs_per_query': 1.8054054054054054}, 'hin-eng': {'average_document_length': 47.20783373301359, 'average_query_length': 34.98574561403509, 'num_documents': 1251, 'num_queries': 912, 'average_relevant_docs_per_query': 1.4100877192982457}, 'ita-ita': {'average_document_length': 59.778301886792455, 'average_query_length': 49.14932126696833, 'num_documents': 1272, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9245852187028658}, 'eng-ita': {'average_document_length': 123.07302075326672, 'average_query_length': 49.14932126696833, 'num_documents': 1301, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9849170437405732}, 'ita-eng': {'average_document_length': 
59.778301886792455, 'average_query_length': 49.040723981900456, 'num_documents': 1272, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9245852187028658}, 'jpn-jpn': {'average_document_length': 41.030605871330415, 'average_query_length': 23.296969696969697, 'num_documents': 1601, 'num_queries': 825, 'average_relevant_docs_per_query': 1.9406060606060607}, 'eng-jpn': {'average_document_length': 126.2647564469914, 'average_query_length': 23.296969696969697, 'num_documents': 1745, 'num_queries': 825, 'average_relevant_docs_per_query': 2.1187878787878787}, 'jpn-eng': {'average_document_length': 41.030605871330415, 'average_query_length': 51.416058394160586, 'num_documents': 1601, 'num_queries': 822, 'average_relevant_docs_per_query': 1.9476885644768855}, 'kor-kor': {'average_document_length': 31.22722159730034, 'average_query_length': 21.81804281345566, 'num_documents': 889, 'num_queries': 654, 'average_relevant_docs_per_query': 1.5642201834862386}, 'eng-kor': {'average_document_length': 112.41231822070145, 'average_query_length': 21.81804281345566, 'num_documents': 1169, 'num_queries': 654, 'average_relevant_docs_per_query': 1.952599388379205}, 'kor-eng': {'average_document_length': 31.22722159730034, 'average_query_length': 43.9527687296417, 'num_documents': 889, 'num_queries': 614, 'average_relevant_docs_per_query': 1.6661237785016287}, 'pol-pol': {'average_document_length': 50.66814439518683, 'average_query_length': 53.72101910828025, 'num_documents': 1579, 'num_queries': 785, 'average_relevant_docs_per_query': 2.080254777070064}, 'eng-pol': {'average_document_length': 112.96919566457501, 'average_query_length': 53.72101910828025, 'num_documents': 1753, 'num_queries': 785, 'average_relevant_docs_per_query': 2.385987261146497}, 'pol-eng': {'average_document_length': 50.66814439518683, 'average_query_length': 54.1994851994852, 'num_documents': 1579, 'num_queries': 777, 'average_relevant_docs_per_query': 2.101673101673102}, 'por-por': 
{'average_document_length': 75.9845869297164, 'average_query_length': 42.58875, 'num_documents': 1622, 'num_queries': 800, 'average_relevant_docs_per_query': 2.14}, 'eng-por': {'average_document_length': 111.42525930445393, 'average_query_length': 42.58875, 'num_documents': 1639, 'num_queries': 800, 'average_relevant_docs_per_query': 2.21875}, 'por-eng': {'average_document_length': 75.9845869297164, 'average_query_length': 46.57967377666248, 'num_documents': 1622, 'num_queries': 797, 'average_relevant_docs_per_query': 2.148055207026349}, 'tam-tam': {'average_document_length': 64.89019607843137, 'average_query_length': 33.267263427109974, 'num_documents': 1275, 'num_queries': 782, 'average_relevant_docs_per_query': 1.6994884910485935}, 'eng-tam': {'average_document_length': 96.96361185983828, 'average_query_length': 33.267263427109974, 'num_documents': 1484, 'num_queries': 782, 'average_relevant_docs_per_query': 2.0255754475703327}, 'tam-eng': {'average_document_length': 64.89019607843137, 'average_query_length': 34.777633289986994, 'num_documents': 1275, 'num_queries': 769, 'average_relevant_docs_per_query': 1.728218465539662}, 'cmn-cmn': {'average_document_length': 20.958944281524925, 'average_query_length': 12.21116504854369, 'num_documents': 1705, 'num_queries': 824, 'average_relevant_docs_per_query': 2.0716019417475726}, 'eng-cmn': {'average_document_length': 108.31593874078276, 'average_query_length': 12.21116504854369, 'num_documents': 1763, 'num_queries': 824, 'average_relevant_docs_per_query': 2.2633495145631066}, 'cmn-eng': {'average_document_length': 20.958944281524925, 'average_query_length': 41.24390243902439, 'num_documents': 1705, 'num_queries': 820, 'average_relevant_docs_per_query': 2.0817073170731706}}} | -| [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | {'test': 1190} | {'validation': {'ar': 
{'average_document_length': 683.4666666666667, 'average_query_length': 53.327993254637434, 'num_documents': 240, 'num_queries': 1186, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 894.0666666666667, 'average_query_length': 69.04318374259103, 'num_documents': 240, 'num_queries': 1181, 'average_relevant_docs_per_query': 1.0}, 'el': {'average_document_length': 894.3791666666667, 'average_query_length': 68.61317567567568, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 784.8333333333334, 'average_query_length': 61.25063291139241, 'num_documents': 240, 'num_queries': 1185, 'average_relevant_docs_per_query': 1.0}, 'es': {'average_document_length': 883.8041666666667, 'average_query_length': 68.23817567567568, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 764.9416666666667, 'average_query_length': 59.684699915469146, 'num_documents': 240, 'num_queries': 1183, 'average_relevant_docs_per_query': 1.0}, 'ro': {'average_document_length': 878.4458333333333, 'average_query_length': 67.17229729729729, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'ru': {'average_document_length': 850.1875, 'average_query_length': 64.94261603375527, 'num_documents': 240, 'num_queries': 1185, 'average_relevant_docs_per_query': 1.0}, 'th': {'average_document_length': 736.7583333333333, 'average_query_length': 55.103389830508476, 'num_documents': 240, 'num_queries': 1180, 'average_relevant_docs_per_query': 1.0}, 'tr': {'average_document_length': 788.3, 'average_query_length': 60.876689189189186, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'vi': {'average_document_length': 803.9083333333333, 'average_query_length': 61.62859560067682, 'num_documents': 240, 'num_queries': 1182, 'average_relevant_docs_per_query': 1.0}, 'zh': {'average_document_length': 252.4, 
'average_query_length': 18.460626587637595, 'num_documents': 240, 'num_queries': 1181, 'average_relevant_docs_per_query': 1.0}}} | -| [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | {'test': 2048} | {'test': 152.41} | -| [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | {'test': 60000} | {'test': 346.35} | -| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | {'test': 50000} | {} | -| [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | {'test': 6161} | {'test': 173.0} | -| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | {'test_expert': 2040} | {'test_expert': 145.88} | -| [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': 
{'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | -| [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 80, 'rus': 80, 'zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 57.113821138211385, 'average_instruction_length': 281.0650406504065, 'average_changed_instruction_length': 326.9430894308943, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 72.65, 'average_instruction_length': 358.925, 'average_changed_instruction_length': 415.325, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 77.5, 'average_instruction_length': 387, 'average_changed_instruction_length': 458, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 23.697674418604652, 'average_instruction_length': 110.09302325581395, 
'average_changed_instruction_length': 122.81395348837209, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | -======= | [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | +| [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaBiolumNeurochemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemEngSpecialtiesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemFieldsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaCompChemSpectroscopyClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCryobiologySeparationClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCrystallographyAnalyticalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaGreenhouseEnantiopureClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | 
Classification | s2s | [Chemistry] | None | None | +| [WikipediaIsotopesFissionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaLuminescenceClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaOrganicInorganicClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 24000} | {'test': {'num_samples': 24000, 'number_of_characters': 83866932, 'num_positive': 24000, 'num_negative': 192000, 'min_query_length': 7, 'avg_query_length': 59.09, 'max_query_length': 180, 'unique_query': 23997, 'min_positive_length': 100, 'avg_positive_length': 385.45, 'max_positive_length': 3515, 'unique_positive': 23993, 'min_negative_length': 100, 'avg_negative_length': 381.24, 'max_negative_length': 9461, 'unique_negative': 191783, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'number_of_characters': 5145316, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 60.83, 'max_query_length': 166, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 375.89, 'max_positive_length': 2241, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 374.19, 'max_negative_length': 4869, 'unique_negative': 11996}, 'bn': {'num_samples': 1500, 'number_of_characters': 5390581, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 7, 'avg_query_length': 47.27, 'max_query_length': 123, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 
394.59, 'max_positive_length': 2338, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 393.98, 'max_negative_length': 5104, 'unique_negative': 11996}, 'cs': {'num_samples': 1500, 'number_of_characters': 5079180, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.27, 'max_query_length': 137, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 383.84, 'max_positive_length': 2300, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 368.25, 'max_negative_length': 3487, 'unique_negative': 11982}, 'da': {'num_samples': 1500, 'number_of_characters': 4746132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.75, 'max_query_length': 137, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 351.68, 'max_positive_length': 2159, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.46, 'max_negative_length': 2563, 'unique_negative': 11972}, 'de': {'num_samples': 1500, 'number_of_characters': 5483592, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 20, 'avg_query_length': 70.0, 'max_query_length': 180, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 391.54, 'max_positive_length': 2674, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 399.27, 'max_negative_length': 3083, 'unique_negative': 12000}, 'en': {'num_samples': 1500, 'number_of_characters': 6217884, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 68.37, 'max_query_length': 162, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 451.73, 'max_positive_length': 3515, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 453.14, 'max_negative_length': 3662, 'unique_negative': 12000}, 'fa': {'num_samples': 1500, 'number_of_characters': 4732619, 'num_positive': 1500, 
'num_negative': 12000, 'min_query_length': 12, 'avg_query_length': 48.67, 'max_query_length': 119, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 347.7, 'max_positive_length': 2571, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.84, 'max_negative_length': 4707, 'unique_negative': 11978}, 'fi': {'num_samples': 1500, 'number_of_characters': 5209132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.34, 'max_query_length': 132, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 394.71, 'max_positive_length': 2129, 'unique_positive': 1498, 'min_negative_length': 100, 'avg_negative_length': 377.84, 'max_negative_length': 2574, 'unique_negative': 11972}, 'hi': {'num_samples': 1500, 'number_of_characters': 5620959, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 13, 'avg_query_length': 50.78, 'max_query_length': 125, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 420.38, 'max_positive_length': 2361, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 409.52, 'max_negative_length': 5912, 'unique_negative': 11996}, 'it': {'num_samples': 1500, 'number_of_characters': 5420496, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 23, 'avg_query_length': 70.05, 'max_query_length': 156, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 396.97, 'max_positive_length': 2082, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 393.33, 'max_negative_length': 9461, 'unique_negative': 11993}, 'nl': {'num_samples': 1500, 'number_of_characters': 5169556, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.34, 'max_query_length': 136, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 380.79, 'max_positive_length': 1864, 'unique_positive': 1500, 'min_negative_length': 100, 
'avg_negative_length': 375.03, 'max_negative_length': 3641, 'unique_negative': 11985}, 'pt': {'num_samples': 1500, 'number_of_characters': 5474356, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.12, 'max_query_length': 176, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 404.02, 'max_positive_length': 3057, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 397.55, 'max_negative_length': 2877, 'unique_negative': 11991}, 'ro': {'num_samples': 1500, 'number_of_characters': 4796113, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 61.97, 'max_query_length': 169, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 346.71, 'max_positive_length': 1917, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 348.59, 'max_negative_length': 4213, 'unique_negative': 11971}, 'sr': {'num_samples': 1500, 'number_of_characters': 5271732, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 15, 'avg_query_length': 55.67, 'max_query_length': 146, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 386.35, 'max_positive_length': 2421, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 384.06, 'max_negative_length': 3668, 'unique_negative': 11974}, 'no': {'num_samples': 1500, 'number_of_characters': 5036586, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.29, 'max_query_length': 129, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 367.72, 'max_positive_length': 1450, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 366.84, 'max_negative_length': 2841, 'unique_negative': 11996}, 'sv': {'num_samples': 1500, 'number_of_characters': 5072698, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 57.73, 'max_query_length': 133, 
'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 372.59, 'max_positive_length': 2493, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 368.94, 'max_negative_length': 3680, 'unique_negative': 11999}}}} | | [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [WikipediaSaltsSemiconductorsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSolidStateColloidalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | None | None | +| [Winoground](https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper) (Tristan Thrush, 2022) | ['eng'] | ImageTextPairClassification | i2t | [Social] | None | None | +| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | +| 
[XFlickr30kCoT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['deu', 'eng', 'ind', 'jpn', 'rus', 'spa', 'tur', 'zho'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None | +| [XM3600T2IRetrieval](https://aclanthology.org/2022.emnlp-main.45/) (Thapliyal et al., 2022) | ['ara', 'ben', 'ces', 'dan', 'deu', 'ell', 'eng', 'fas', 'fil', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mri', 'nld', 'nor', 'pol', 'por', 'quz', 'ron', 'rus', 'spa', 'swa', 'swe', 'tel', 'tha', 'tur', 'ukr', 'vie', 'zho'] | Any2AnyRetrieval | t2i | [Encyclopaedic, Written] | None | None | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | -| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 
'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 
682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 
'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': 
{'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 
310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, 
'1': {'count': 682}}}}}} | -| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | None | None | +| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 
'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 
'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 
'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 
'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 
'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | +| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | None | None | | [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, 
Written] | None | None | | [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | None | None | | [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | | [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | | [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | | [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | -| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | None | None | +| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283654099, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 75, 'min_instruction_length': 93, 'average_instruction_length': 389.95, 'max_instruction_length': 887, 'unique_instructions': 75, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 450.55, 'max_changed_instruction_length': 974, 
'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'eng-fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129597567, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 80.08, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 150, 'average_instruction_length': 396.88, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 205, 'average_changed_instruction_length': 463.18, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109522175, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 40, 'min_instruction_length': 93, 'average_instruction_length': 371.12, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 431.8, 'max_changed_instruction_length': 957, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 
'max_average_top_ranked_per_query': 1000}, 'eng-zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44534357, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 32, 'average_query_length': 83.56, 'max_query_length': 159, 'unique_queries': 43, 'min_instruction_length': 157, 'average_instruction_length': 401.02, 'max_instruction_length': 731, 'unique_instructions': 43, 'min_changed_instruction_length': 209, 'average_changed_instruction_length': 456.26, 'max_changed_instruction_length': 822, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | | [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283622456, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 10, 'average_query_length': 57.11, 'max_query_length': 136, 'unique_queries': 123, 'min_instruction_length': 37, 'average_instruction_length': 281.07, 'max_instruction_length': 1009, 'unique_instructions': 123, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 326.94, 'max_changed_instruction_length': 1083, 'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'fas': {'num_samples': 
41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129593838, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 72.65, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 121, 'average_instruction_length': 358.93, 'max_instruction_length': 759, 'unique_instructions': 40, 'min_changed_instruction_length': 163, 'average_changed_instruction_length': 415.32, 'max_changed_instruction_length': 842, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109523683, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 26, 'average_query_length': 77.5, 'max_query_length': 136, 'unique_queries': 40, 'min_instruction_length': 78, 'average_instruction_length': 387.0, 'max_instruction_length': 1009, 'unique_instructions': 40, 'min_changed_instruction_length': 187, 'average_changed_instruction_length': 458.0, 'max_changed_instruction_length': 1083, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44504935, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 10, 'average_query_length': 23.7, 'max_query_length': 44, 
'unique_queries': 43, 'min_instruction_length': 37, 'average_instruction_length': 110.09, 'max_instruction_length': 209, 'unique_instructions': 43, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 122.81, 'max_changed_instruction_length': 229, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | ->>>>>>> main @@ -1113,2117 +866,1061 @@ The following tables give you an overview of the tasks in MTEB.
-<<<<<<< HEAD -| Language | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | -|---|------|------|------|------|------|------|------|------|------|------|---| -| aai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aau | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aaz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| abs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| abt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| abx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aby | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ace | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| acf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| acm | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| acq | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| acr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| acu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| adz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aeb | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aer | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aey | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| afr | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| agd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| agu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aia | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aii | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ajp | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aka | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ake | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| alp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| alq | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | -| als | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| aly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ame | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amh | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| amk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| amx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ang | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| anh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| anp | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| anv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aoi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aoj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aom | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aon | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apc | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ape | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| apz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ara | 2 | 12 | 0 | 0 | 0 | 2 | 1 | 9 | 2 | 0 | 0 | -| arb | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | -| are | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| arl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| arn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| arp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| arq | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -| ars | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ary | 
1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| arz | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| asm | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | -| aso | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ast | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ata | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| atb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| atd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| atg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| att | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| auc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aui | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| auy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| avt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| awa | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| awb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| awk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| awx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ayr | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| azb | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| aze | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| azg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| azj | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| azz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bak | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bam | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ban | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bao | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bba | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bbb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bbc | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bbr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bch | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bco | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bdd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bef | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bel | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | -| bem | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ben | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | -| beo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ber | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| beu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bew | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bgc | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bgs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bgt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bhb | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bhd | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bhg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bhl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bho | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | -| bhp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| big | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjj | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjn | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bjz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bkd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bki | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bkq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bkx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| blw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| blz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bmh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bmk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bmr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bmu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bnp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bns | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| boa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bod | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| boj | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | -| bon | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bos | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| box | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| boy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bpr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bps | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bqc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bqp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bra | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bre | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| brx | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bsj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bsn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bsp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bss | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bug | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| buk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bul | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | -| bus | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bvd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bvr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bxh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| byr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| byx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bzd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bzh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| bzj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| caa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cab | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cac | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| caf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cao | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cap | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| car | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cat | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| cav | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cax | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbk | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cbv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cco | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ceb | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| cek | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ces | 4 | 5 | 2 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | -| cgc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cha | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chv | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| chz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cjk | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cjo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cjv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ckb | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| cle | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| clu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cme | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cmn | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | -| cmo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cni | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cnl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cnt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| code | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | -| cof | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| con | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cor | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-| cot | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cpa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cpb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cpc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cpu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cpy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| crh | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| crn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| crx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| csb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cso | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| csy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cta | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cth | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ctp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ctu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cub | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cuc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cui | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cuk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cux | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cwe | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| cym | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| daa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dad | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dah | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dan | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | -| ded | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| deu | 6 | 14 | 7 | 0 | 1 | 6 | 2 | 18 | 4 | 0 | 0 | -| dgc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dgr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dgz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dhg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dif | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dik | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| div | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | -| dji | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| djk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| djr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dob | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| doi | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dov | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dsb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dtp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dwr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dww | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dwy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dyu | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dza | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| dzo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ebk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| eko | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ell | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | -| emi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| emp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| eng | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 91 | 13 | 2 | 1 | -| enq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| epo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| eri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ese | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| esk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| est | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | -| etr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| eus | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ewe | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| faa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fao | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -| far | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fas | 1 | 4 | 0 | 0 | 0 | 1 | 2 | 9 | 0 | 0 | 0 | -| ffm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fij | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fil | 1 | 2 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fin | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | -| fon | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| for | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fra | 7 | 13 | 8 | 0 | 1 | 5 | 3 | 14 | 4 | 0 | 1 | -| fry | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fuc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fue | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fuf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fuh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fur | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| fuv | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| gah | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gam | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gaw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gaz | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| gbm | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gdn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gdr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| geb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gfk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ghs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gla | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gle | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| glg | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| glk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| glv | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gmv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gng | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gnn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gnw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gof | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gom | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| grc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| grn | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| gsw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gub | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-| guh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gui | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| guj | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | -| gul | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gum | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gun | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| guo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gux | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gvc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gvf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gvs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gwi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gym | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| gyr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hat | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| hau | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| haw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hbo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hch | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| heb | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| heg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hin | 9 | 12 | 2 | 0 | 0 | 1 | 2 | 10 | 2 | 0 | 0 | -| hix | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hla | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hlt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hmn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hmo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hne | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hns | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hot | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hrv | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | -| hsb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hto | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hub | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hui | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | -| hun | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | -| hus | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| huu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| huv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| hye | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | -| ian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ibo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ido | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ign | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ikk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ikw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ile | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ilo | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| imo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ina | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| inb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ind | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | -| ino | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| iou | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ipi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| isl | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| isn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ita | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | -| iws | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ixl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jac | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jae | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jao | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jav | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| jic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jid | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jiv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jni | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| jpn | 5 | 8 | 3 | 0 | 0 | 1 | 3 | 13 | 2 | 0 | 0 | -| jvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kab | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kac | 1 | 1 | 1 | 
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| kam | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kan | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | -| kaq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kas | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kat | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | -| kaz | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| kbc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kbh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kbm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kbp | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kbq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kdc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kde | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kdl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kea | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| kek | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ken | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kew | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kfg | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kfy | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kgf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kgk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kgp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| khk | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| khm | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| khs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| khz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kik | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kin | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| kir | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| kiw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kiz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kje | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kjs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kkc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kkl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| klt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| klv 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmb | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmr | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kms | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kmu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| knc | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kne | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| knf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| knj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| knv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kon | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kor | 4 | 8 | 1 | 0 | 1 | 2 | 1 | 8 | 3 | 0 | 0 | -| kos | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kpx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kqa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kqc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kqf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kql | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kqw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| krc | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ksd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ksj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ksr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ktm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kto | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kud | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kue | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kur | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | -| kvg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kwd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kwf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kwi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kwj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kyc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kyf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kyg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kyq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kyz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kze | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| kzj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lac | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lao | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| lat | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lav | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | -| lbb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lbk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lcm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| leu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lex | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lfn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lgl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lid | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lif | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lij | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lim | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lin | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| lit | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | -| llg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lmo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ltg | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ltz | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lua | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| lug | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| luo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| lus | 1 | 1 | 1 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | -| lvs | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| lww | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| maa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mad | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mag | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mai | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| maj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mak | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mal | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | -| mam | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| maq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mar | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | -| mau | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mav | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| max | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| maz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mbt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mco | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mcr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mdy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| med | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mee | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mek | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| meq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| met | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| meu | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mey | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mgc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mgh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mgw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mhl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mhr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mib | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mie | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mig | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mih | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mil | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| min | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mio | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mir | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| miz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mjc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mkd | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| mkj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mkl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mkn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mks | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mle | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mlg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mlh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mlp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mlt | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | -| mmo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mmx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mna | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mni | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mon | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mos | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mox | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mph | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| 
mpj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mpm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mpp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mps | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mpt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mpx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mqb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mqj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mri | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| msa | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| msb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| msc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| msk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| msm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| msy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mti | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mto | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mui | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mup | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mux | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| muy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mva | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mwc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mwe | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mwf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mwp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mwr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mxb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mxp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mxq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mxt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mya | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| myk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| myu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| myw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| myy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| mzz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | -| nab | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| naf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nas | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nbl | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nbq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nch | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ncj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ncl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ncu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nde | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ndg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ndj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nds | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nep | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nfa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ngp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ngu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhe | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nho | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nhy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nif | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nii | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nij | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nko | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nld | 6 | 6 | 1 | 0 | 1 | 0 | 1 | 2 | 2 | 0 | 0 | -| nlg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nna | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nno | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nnq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| noa | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | -| nob | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | -| noe | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nor | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | -| not | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nou | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nov | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| npi | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| npl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nqo | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nsn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nso | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| nss | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ntj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ntp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ntu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nus | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nuy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nvm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nwi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nya | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| nys | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| nyu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| obo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| oci | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| okv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| omw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ong | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ons | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ood | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| opm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ori | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| orm | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| orv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ory | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | -| ote | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| otm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| otn | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| otq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ots | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pab | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pad | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pag | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pah | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pam | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pan | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | -| pao | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pap | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pbt | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| pcm | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pes | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| pib | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pio | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pir | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| piu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pjt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pls | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| plt | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| plu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pma | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pms | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| poe | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| poh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| poi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pol | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | -| pon | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| por | 4 | 9 | 1 | 0 | 2 | 2 | 1 | 5 | 3 | 0 | 0 | -| poy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ppo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| prf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| prs | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ptp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ptu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| pus | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | -| pwg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qub | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| quc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| quf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| quh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qul | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| quy | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qve | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qvz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qwh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qxh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qxn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| qxo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| raj | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| reg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rej | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rgu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rkb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rmc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rmy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rom | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ron | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | -| roo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rop | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| row | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rro | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ruf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rug | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| run | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| rus | 5 | 13 | 6 | 0 | 2 | 4 | 
2 | 16 | 4 | 0 | 0 | -| rwo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sab | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sag | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sah | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| san | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | -| sat | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sbe | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sbk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sbs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| scn | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sco | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| seh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sey | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sgb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sgz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| shi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| shj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| shn | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| shp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sin | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| sja | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| slk | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | -| sll | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| slv | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | -| smk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| smo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sna | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| snc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| snd | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| snn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| snp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| snx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sny | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| som | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| soq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sot | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| soy | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| spa | 4 | 13 | 4 | 0 | 1 | 2 | 2 | 12 | 4 | 0 | 0 | -| spl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| spm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| spp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sps | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| spy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sqi | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| srd | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| srm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| srn | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| srp | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | -| srq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ssd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ssg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ssw | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| ssx | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| stp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sue | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sun | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| sus | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| suz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| svk | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| swa | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | -| swe | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | -| swg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| swh | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| swp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| sxb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| szl | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tac | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tah | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| taj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tam | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | -| taq | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tat | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-| tav | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| taw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tbc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tbf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tbg | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tbo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tbz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tcs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tcz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tdt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tee | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tel | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | -| ter | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tet | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tew | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tfr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tgk | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| tgl | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| tgo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tgp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tha | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | -| tif | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tir | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| tiw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tiy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tke | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tku | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tlf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tmd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tna | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tnc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tnk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tnn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tnp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| toc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tod | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | -| tof | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| toj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ton | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| too | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| top | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tos | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tpa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tpi | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tpt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tpz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| trc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tsn | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| tso | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| tsw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ttc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tte | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tuc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tue | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tuf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tuk | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tum | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tuo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tur | 4 | 7 | 1 | 0 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | -| tvk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| twi | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| txq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| txu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tyv | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tzj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tzl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tzm | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| tzo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ubr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ubu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| udu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| uig | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ukr | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| uli | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ulk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| umb | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| upv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ura | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| urb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| urd | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | -| uri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| urt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| urw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| usa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| usp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| uvh | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| uvl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| uzb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| uzn | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| vec | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ven | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| vid | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| vie | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | -| viv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| vmy | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| waj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wal | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wap | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| war | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| wat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wbi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wbp | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wed | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wer | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wiu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wiv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wln | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wmt | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wmw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wnc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wnu | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wol | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| wos | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wrk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wro | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wrs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wsk | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wuu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| wuv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xav | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xbi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xed | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xho | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| xla | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xnn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xon | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xsi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xtd | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| xtm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yaa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yad | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yal | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yap | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yaq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yby | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ycn | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ydd | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yid | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yka | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yle | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yml | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yon | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yor | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | -| yrb | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yre | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yss | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yue | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yuj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | -| yut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yuw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| yva | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zaa | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zab | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zac | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zad | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zaj | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zam | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zao | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zap | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zas | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zav | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zaw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zga | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zho | 2 | 2 | 1 | 0 | 0 | 1 | 1 | 13 | 0 | 0 | 0 | -| zia | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ziw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zlm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zos | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpc | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpl | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpm | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpv | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zpz | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zsm | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| zsr | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| ztq | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zty | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| zul | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -| zyp | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | -| Total | 1394 | 795 | 304 | 3 | 28 | 67 | 50 | 456 | 85 | 2 | 2 | -======= -| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | Sum | -|---|------|------|------|------|------|------|------|------|------|------|------|---| -| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 
| 0 | 10 | -| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | -| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 
| -| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amu | Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aom | ร–mie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apn | Apinayรฉ | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apu | Apurinรฃ | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 29 | -| arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 8 | -| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arn | 
Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arp | Arapaho | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | -| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 7 | -| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 14 | -| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| azb 
| South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| aze | Azerbaijani | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bel | Belarusian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bem | Bemba (Zambia) | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 28 | -| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | -| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkq | Bakairรญ | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blz | Balantak | Austronesian 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqc | Boko (Benin) | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqp | Busa | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | -| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bug | Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 13 | -| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caa | Chortรญ | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cao | Chรกcobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cat | Catalan | Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| cav | Cavineรฑa | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbc | Carapana | Tucanoan | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 16 | -| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chf | Tabasco Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chz | Ozumacรญn Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| cjo | Ashรฉninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 44 | -| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cni | Ashรกninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 37 | -| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| con | Cofรกn | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cor | Cornish | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cot | Caquinte | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpb | Ucayali-Yurรบa Ashรฉninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpc | Ajyรญninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpu | Pichis Ashรฉninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpy | South Ucayali Ashรฉninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csb | Kashubian | Indo-European | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cux | Tepeuxila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cwe | Kwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| daa | Dangalรฉat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 | -| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 6 | 2 | 18 | 4 | 0 | 0 | 58 | -| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dop | Lukpa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dov | Dombe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ebk | Eastern Bontok | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | -| emi | 
Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| emp | Northern Emberรก | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 105 | 13 | 2 | 1 | 316 | -| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 8 | -| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | -| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 1 | 4 | 0 | 0 | 0 | 1 | 2 | 9 | 0 | 0 | 0 | 17 | -| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 | -| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 7 | 13 | 8 | 
0 | 1 | 5 | 3 | 15 | 4 | 0 | 1 | 57 | -| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 1 | -| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gng | Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnw | Western Bolivian Guaranรญ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| grn | Guarani | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gsw | Swiss German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gub | Guajajรกra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gui | Eastern Bolivian Guaranรญ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | -| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gun | Mbyรก Guaranรญ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gux | Gourmanchรฉma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvf | Golin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 
| -| gwi | Gwichสผin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gym | Ngรคbere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | -| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 | -| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 1 | 2 | 10 | 2 | 0 | 0 | 38 | -| hix | Hixkaryรกna | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| hsb | Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 12 | -| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 9 | -| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 12 | -| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 21 | -| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| iou | Tuma-Irumu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 
0 | 9 | -| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | -| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 13 | -| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 1 | 3 | 13 | 2 | 0 | 0 | 35 | -| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 19 | -| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 10 | -| kaz | Kazakh | Turkic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| kbc | Kadiwรฉu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbh | Camsรก | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbm | Iwal | Austronesian | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbp | Kabiyรจ | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| kek | Kekchรญ | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgk | Kaiwรก | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 8 | -| kir | Kirghiz | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kje | Kisar | Austronesian | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kmg | Kรขte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 2 | 1 | 9 | 3 | 0 | 0 | 29 | -| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpg | Kapingamarangi | Austronesian | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpj | Karajรก | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kvn | Border Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwi | 
Awa-Cuaiquer | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyz | Kayabรญ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lif | Limbu | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lim | Limburgan | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lin | 
Lingala | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| lit | Lithuanian | Indo-European | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maa | San Jerรณnimo Tecรณatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| maj | Jalapa De Dรญaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 19 | -| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maq | Chiquihuitlรกn Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 20 | -| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mav | Saterรฉ-Mawรฉ | Tupian | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbj | Nadรซb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbl | Maxakalรญ | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcf | Matsรฉs | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mco | Coatlรกn Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mek | Mekeo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meq | Merey | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meu | Motu | 
Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mib | Atatlรกhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mil | Peรฑoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkd | Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| mkj | Mokilese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | -| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mle | Manambu | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mop | Mopรกn Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpm | Yosondรบa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mri | Maori | Austronesian | 2 | 1 | 1 | 
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwp | Kala Lagaw Ya | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwr | Marwari | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxb | Tezoatlรกn Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 1 | -| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| myk | Mamara Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myu | Mundurukรบ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nab | Southern Nambikuรกra | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncl | Michoacรกn Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nds | Low German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | -| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngu | Guerrero Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhi | Zacatlรกn-Ahuacatlรกn-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 0 | 1 | 2 | 2 | 0 | 0 | 19 | -| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nob | Norwegian Bokmรฅl | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 19 | -| noe | Nimadi | Indo-European | 0 | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 3 | -| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntu | Natรผgu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nyu | Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| omw | South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 15 | -| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otq | Querรฉtaro Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ots | Estado de Mรฉxico Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pab | Parecรญs | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pad | Paumarรญ | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | -| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pap | Papiamento | Indo-European 
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| plu | Palikรบr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 | -| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | -| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pri | Paicรฎ | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | -| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ptp | Patep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qub | Huallaga Huรกnuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quy | Ayacucho Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| qvc | Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qve | Eastern Apurรญmac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvh | Huamalรญes-Dos de Mayo Huรกnuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvn | North Junรญn Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvs | San Martรญn Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxh | Panao Huรกnuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | -| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxo | Southern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 18 | -| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 52 | -| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sah 
| Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 | -| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgb | Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 12 | -| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| sna | Shona | 
Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 2 | 2 | 13 | 4 | 0 | 0 | 43 | -| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 9 | -| srq | Sirionรณ | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssd | Siroi | 
Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sun | Sundanese | Austronesian | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| sus | Susu | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 15 | -| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 22 | -| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 21 | -| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tat | Tatar | 
Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taw | Tai | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 24 | -| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 21 | -| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tim | Timbe | 
Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tlf | Telefol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnc | Tanimuca-Retuarรฃ | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| too | Xicotepec De Juรกrez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpa | Taupota | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| trc | Copala Triqui | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | 19 | -| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| txu | Kayapรณ | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzm | Central Atlas Tamazight | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | -| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ukr | Ukrainian | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| uli | Ulithian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urb | Urubรบ-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 19 | -| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ven | Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 
0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 18 | -| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vmy | Ayautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuu | Wu Chinese | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xav | Xavรกnte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtm | Magdalena Peรฑasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yle | Yele | 
Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yml | Iamalele | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yon | Yongkom | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 16 | -| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yre | Yaourรฉ | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaa | Sierra de Juรกrez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zac | Ocotlรกn Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zam | Miahuatlรกn Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zar | Rincรณn Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 1 | 1 | 13 | 0 | 0 | 0 | 20 | -| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zos | Francisco Leรณn Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpl | Lachixรญo Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpo | Amatlรกn Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpu | Yalรกlag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ztq | Quioquitani-Quierรญ Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 1 | -| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 473 | 85 | 2 | 2 | ->>>>>>> main +| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyRetrieval | Any2TextMutipleChoice | BitextMining | Classification | Clustering | ImageClassification | ImageClustering | ImageMultilabelClassification | ImageTextPairClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisualSTS | ZeroShotClassification | Sum | +|---|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|------|---| +| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aau | Abau | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abt | Ambulas | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ace | Achinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| acr | Achi | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| adz | Adzera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 10 | +| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agg | Angor | Senagi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agm | Angaataha | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agr | Aguaruna | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agu | Aguacateco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aia | Arosi | 
Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ake | Akawaio | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alp | Alune | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alq | Algonquin | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 14 | +| amk | Ambai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amn | Amanab | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amp | Alamblak | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | +| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ang | Old English (ca. 450-1100) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aom | ร–mie | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aon | Bumbita Arapesh | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apb | Sa'a | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apn | Apinayรฉ | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apu | Apurinรฃ | Arawakan 
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ara | Arabic | Unclassified | 0 | 2 | 0 | 2 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 1 | 0 | 32 | +| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 8 | +| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arp | Arapaho | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 7 | +| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| asm | Assamese | Indo-European | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 14 | +| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ast | Asturian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auc | Waorani | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aui | Anuki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awa | Awadhi | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | +| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bak | Bashkir | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bam | Bambara | Mande | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| ban | Balinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbb | Barai | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bch | Bariai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bco | Kaluli | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bdd | Bunama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 
0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ben | Bengali | Indo-European | 0 | 1 | 0 | 7 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 29 | +| beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bew | Betawi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bhp | Bima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| big | Biangai | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
bjk | Barok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjn | Banjar | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjv | Bedjond | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkd | Binukid | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bki | Baki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkq | Bakairรญ | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blw | Balangao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blz | Balantak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmr | Muinane | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bnp | Bola | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bns | Bundeli | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boa | Bora | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bos | Bosnian | Indo-European | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqp | Busa | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bre | Breton | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsn | Barasana-Eduria | 
Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bug | Buginese | Austronesian | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| buk | Bugawac | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 14 | +| bus | Bokobaru | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvr | Burarra | Maningrida | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byr | Baruya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byx | Qaqet | Baining | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzd | Bribri | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caa | Chortรญ | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cab | Garifuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cac | Chuj | Mayan | 0 
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cao | Chรกcobo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| car | Galibi Carib | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cat | Catalan | Indo-European | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| cav | Cavineรฑa | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | +| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ces | Czech | Indo-European | 0 | 1 | 0 | 4 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 18 | +| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cha | Chamorro | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chk | Chuukese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chz | Ozumacรญn Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| cjo | Ashรฉninka Pajonal | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjv | Chuave | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| cle | Lealao 
Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 1 | 0 | 45 | +| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cni | Ashรกninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 0 | 0 | 41 | +| cof | Colorado | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| con | Cofรกn | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cor | Cornish | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cot | Caquinte | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpb | Ucayali-Yurรบa Ashรฉninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpc | Ajyรญninka Apurucayali | Arawakan | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpu | Pichis Ashรฉninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpy | South Ucayali Ashรฉninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csb | Kashubian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cso | Sochiapam Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctu | Chol | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cut | Teutila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cym | Welsh | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| daa | Dangalรฉat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dan | Danish | Indo-European | 0 | 2 | 0 | 5 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 25 | +| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| deu | German | Indo-European | 0 | 2 | 0 | 6 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 2 | 0 | 63 | +| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dif | Dieri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dik | 
Southwestern Dinka | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| div | Dhivehi | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dob | Dobu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dww | Dawawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dyu | Dyula | Mande | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 3 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 18 | +| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| emp | Northern Emberรก | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 18 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 7 | 24 | 483 | +| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| est | Estonian | Uralic | 0 | 1 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 9 | +| etr | Edolo | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eus | Basque | Unclassified | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| faa | Fasu | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
+| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fas | Persian | Indo-European | 0 | 1 | 0 | 4 | 28 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 0 | 0 | 91 | +| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fin | Finnish | Uralic | 0 | 1 | 0 | 3 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 0 | 0 | 20 | +| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fra | French | Indo-European | 0 | 1 | 0 | 7 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 2 | 0 | 61 | +| fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fur | Friulian | Indo-European | 0 | 0 
| 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gai | Borei | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| geb | Kire | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gle | Irish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glg | Galician | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glk | Gilaki | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnw | Western Bolivian Guaranรญ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| grn | Guarani | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gub | Guajajรกra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gui | Eastern Bolivian Guaranรญ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guj | Gujarati | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 18 | +| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gun | 
Mbyรก Guaranรญ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gux | Gourmanchรฉma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gwi | Gwichสผin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gym | Ngรคbere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gyr | Guarayu | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hat | Haitian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 14 | +| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 12 | +| heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hin | Hindi | Indo-European | 0 | 1 | 0 | 9 | 12 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 40 | +| hix | Hixkaryรกna | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmn | Hmong | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hot | Hote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hrv | Croatian | Indo-European | 0 | 1 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 11 | +| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hun | Hungarian | Uralic | 0 | 1 | 0 | 5 | 3 | 1 | 0 
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | +| hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hvn | Sabu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hye | Armenian | Indo-European | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| ian | Iatmul | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 12 | +| ido | Ido | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ile | Interlingue | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ilo | Iloko | Austronesian | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | +| ind | Indonesian | Austronesian | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 24 | +| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| isl | Icelandic | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ita | Italian | Indo-European | 0 | 1 | 0 | 5 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 2 | 0 | 30 | +| iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jae | Yabem | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jav | Javanese | Austronesian | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 13 | +| jic | Tol | Jicaquean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jpn | Japanese | Japonic | 0 | 3 | 0 | 5 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 0 | 0 | 39 | +| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kan | Kannada | Dravidian | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 19 | +| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| kat | Georgian | Kartvelian | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 10 | +| kaz | Kazakh | Turkic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| kbc | Kadiwรฉu | Guaicuruan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbh | Camsรก | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbm | Iwal | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbp | Kabiyรจ | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kde | Makonde | Atlantic-Congo | 0 | 0 | 0 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| kek | Kekchรญ | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgk | Kaiwรก | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 8 | +| kir 
| Kirghiz | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kje | Kisar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kmg | Kรขte | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmo | Kwoma | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmu | Kanite | Nuclear Trans New 
Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 4 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 1 | 0 | 33 | +| kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpj | Karajรก | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kql | Kyenele | Yuat | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqw | Kandas | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksj | Uare | Kwalean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ktm | Kurti | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kto | Kuot | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kur | Kurdish | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwj | Kwanga | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyf | Kouya | Kru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyz | Kayabรญ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lac | Lacandon | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| lat | Latin | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| lav | Latvian | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lbb | Label | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lcm | Tungag | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| leu | Kara (Papua New Guinea) | Austronesian 
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lex | Luang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lgl | Wala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lij | Ligurian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lim | Limburgan | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| llg | Lole | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lmo | Lombard | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 
| 0 | 0 | 0 | 0 | 5 | +| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| lww | Lewo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maa | San Jerรณnimo Tecรณatl Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mad | Madurese | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| mag | Magahi | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mai | Maithili | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| maj | Jalapa De Dรญaz Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mak | Makasar | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mal | Malayalam | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 19 | +| mam | Mam | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maq | Chiquihuitlรกn Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mar | Marathi | Indo-European | 0 | 0 | 0 | 7 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 20 | +| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mav | Saterรฉ-Mawรฉ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| max | North Moluccan Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maz | Central 
Mazahua | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbc | Macushi | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbj | Nadรซb | Naduhup | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbl | Maxakalรญ | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mca | Maca | Mataguayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcf | Matsรฉs | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mco | Coatlรกn Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcq | Ese | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcr | Menya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | +| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mee | Mengen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mek | Mekeo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| met | Mato | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meu | Motu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mib | Atatlรกhuca Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mih | Chayuco Mixtec | 
Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mil | Peรฑoles Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| min | Minangkabau | Austronesian | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mir | Isthmus Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mle | Manambu | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlp | Bargam | Nuclear Trans New 
Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mmx | Madak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mna | Mbula | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mon | Mongolian | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mop | Mopรกn Maya | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mox | Molima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpm | Yosondรบa Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mps | Dadibi | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | +| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msy | Aruamu | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mui | Musi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mup | Malvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| mux | Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mva | Manam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwc | Are | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwr | Marwari | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxb | Tezoatlรกn Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myu | Mundurukรบ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myw | Muyuw | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myy | Macuna | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nab | Southern Nambikuรกra | Nambiquaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | +| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nak | Nakanai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncl | Michoacรกn Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nds | Low German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nep | Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nfa | Dhao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhi | Zacatlรกn-Ahuacatlรกn-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nho | Takuu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nld | Dutch | Indo-European | 0 | 1 | 0 | 6 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 2 | 0 
| 23 | +| nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nob | Norwegian Bokmรฅl | Unclassified | 0 | 0 | 0 | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 19 | +| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nop | Numanggang | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nsn | Nehan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
+| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| nss | Nali | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntu | Natรผgu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nus | Nuer | Nilotic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nvm | Namiae | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ong | Olo | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| orm | Oromo | Unclassified | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| orv | Old Russian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ory | Odia | Indo-European | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 15 | +| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otq | Querรฉtaro Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ots | Estado de Mรฉxico Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pab | Parecรญs | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pad | Paumarรญ | Arawan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pah | Tenharim | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pam | Pampanga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pan | Panjabi | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 18 | +| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pap | Papiamento | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| pib | Yine | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pio | Piapoco | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| plu | Palikรบr | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pma | Paama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
pms | Piemontese | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pol | Polish | Indo-European | 0 | 1 | 0 | 4 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 1 | 0 | 48 | +| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| por | Portuguese | Indo-European | 0 | 1 | 0 | 4 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 1 | 0 | 30 | +| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pri | Paicรฎ | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prs | Dari | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ptp | Patep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ptu | Bambam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pus | Pushto | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qub | Huallaga Huรกnuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quc | K'iche' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qve | Eastern Apurรญmac Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvh | Huamalรญes-Dos de Mayo Huรกnuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvn | North Junรญn Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvs | San Martรญn Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvw | Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qwh | Huaylas Ancash Quechua | 
Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxh | Panao Huรกnuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rej | Rejang | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmc | Carpathian Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rom | Romany | Unclassified | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| ron | Romanian | Indo-European | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 19 | +| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rop | Kriol | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| row | Dela-Oenale | Austronesian 
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rro | Waima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| rus | Russian | Indo-European | 0 | 2 | 0 | 5 | 13 | 6 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 1 | 0 | 55 | +| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| san | Sanskrit | Indo-European | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sat | Santali | Austroasiatic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sbe | Saliba | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| scn | Sicilian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| seh | Sena | 
Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sey | Secoya | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgz | Sursurunga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shj | Shatt | Dajuic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slk | Slovak | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 12 | +| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slv | Slovenian | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 | +| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smo | Samoan | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 
| 0 | 8 | +| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snd | Sindhi | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| snn | Siona | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 4 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 2 | 0 | 48 | +| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sps | Saposa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spy | Sabaot | Nilotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sqi | Albanian | Unclassified | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| srd | Sardinian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sri | Siriano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| srp | Serbian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 9 | +| srq | Sirionรณ | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssg | Seimat | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sua | Sulka | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| svk | Slovakian Sign 
Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 16 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 4 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 23 | +| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| szl | Silesian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tah | Tahitian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tam | Tamil | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 21 | +| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tat | Tatar | Turkic | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbc | Takia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbf | Mandara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbo | Tawala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tca | Ticuna | Ticuna-Yuri | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 25 | +| ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tfr | Teribe | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgk | Tajik | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| tgo | Sudest | 
Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 4 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 22 | +| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | +| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tmd | Haruai | Piawi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnc | Tanimuca-Retuarรฃ | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tod | Toma | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toj | Tojolabal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| too | Xicotepec De Juรกrez Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpa | Taupota | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ttc | Tektiteko 
| Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuc | Mutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tur | Turkish | Turkic | 0 | 3 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 1 | 0 | 24 | +| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| txu | Kayapรณ | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzl | Talossan | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tzo | 
Tzotzil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubr | Ubir | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| udu | Uduk | Koman | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uig | Uighur | Turkic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | +| uli | Ulithian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ura | Urarina | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urb | Urubรบ-Kaapor | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urd | Urdu | Indo-European | 0 | 0 | 0 | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 19 | +| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urt | Urat | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usp | Uspanteco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvl | Lote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 20 | +| viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wal | Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wap | Wapishana | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | +| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbi | Vwanji | Atlantic-Congo | 0 
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wed | Wedau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wer | Weri | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiu | Wiru | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiv | Vitu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnu | Usan | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wro | Worrorra | Worrorran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrs | Waris | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wsk | Waskia | Nuclear Trans 
New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xav | Xavรกnte | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 | +| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xsi | Sio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtm | Magdalena Peรฑasco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yal | Yalunka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yap | Yapese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| yid | Yiddish | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yka | Yakan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 16 | +| yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea |
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zai | Isthmus Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zap | Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zca |
Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zho | Chinese | Unclassified | 0 | 2 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 0 | 0 | 23 | +| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpv | Chichicapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 5 | +| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | +| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 311 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 55 | 507 | 88 | 2 | 2 | 24 | 24 |
diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 1ec1ebc4fc..c0368dcc54 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -63,7 +63,7 @@ class AbsTask(ABC): dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore data_loaded: bool = False is_multilingual: bool = False - hf_subsets: list[HFSubset] | None = None + hf_subsets: list[HFSubset] def __init__(self, seed: int = 42, **kwargs: Any): self.save_suffix = kwargs.get("save_suffix", "") @@ -73,6 +73,7 @@ def __init__(self, seed: int = 42, **kwargs: Any): np.random.seed(self.seed) torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) + self.hf_subsets = list(self.metadata.hf_subsets_to_langscripts.keys()) def check_if_dataset_is_superseded(self): """Check if the dataset is superseded by a newer version""" diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 1c373cc2f7..b8105dc141 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -71,7 +71,7 @@ def evaluate( subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, - **kwargs, + **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: if not self.data_loaded: self.load_data() diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 67aa9fbccd..1bebc79930 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -82,6 +82,8 @@ "Web", "Written", "Programming", + "Chemistry", + "Financial", ] SAMPLE_CREATION_METHOD = Literal[ @@ -94,6 +96,7 @@ "machine-translated and localized", "LM-generated and verified", "rendered", + "multiple", ] TASK_TYPE = Literal[ @@ -119,6 +122,7 @@ "ZeroShotClassification", ] + TASK_CATEGORY = Literal[ "s2s", # Sentence-to-sentence "s2p", # Sentence-to-paragraph @@ -199,6 +203,8 @@ "gpl-3.0", "cdla-sharing-1.0", "mpl-2.0", + "msr-la-nc", + "multiple", ] ) @@ -258,7 +264,7 @@ class TaskMetadata(BaseModel): bibtex_citation: The 
BibTeX citation for the dataset. Should be an empty string if no citation is available. """ - dataset: dict + dataset: dict[str, Any] name: str description: str @@ -365,6 +371,15 @@ def _check_language_code(code): f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}" ) + @property + def bcp47_codes(self) -> list[ISO_LANGUAGE_SCRIPT]: + """Return the languages and script codes of the dataset formatting in accordance with the BCP-47 standard.""" + if isinstance(self.eval_langs, dict): + return sorted( + {lang for langs in self.eval_langs.values() for lang in langs} + ) + return sorted(set(self.eval_langs)) + @property def languages(self) -> list[str]: """Return the languages of the dataset as iso639-3 codes.""" @@ -451,8 +466,12 @@ def n_samples(self) -> dict[str, int] | None: for subset, subset_value in stats.items(): if subset == "hf_subset_descriptive_stats": continue - n_samples[subset] = subset_value["num_samples"] + n_samples[subset] = subset_value["num_samples"] # type: ignore return n_samples def __hash__(self) -> int: return hash(self.model_dump_json()) + + @property + def revision(self) -> str: + return self.dataset["revision"] diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py new file mode 100644 index 0000000000..106419b752 --- /dev/null +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Any + +from pydantic import ConfigDict, model_validator + +from mteb.abstasks.AbsTask import AbsTask +from mteb.abstasks.TaskMetadata import ( + ANNOTATOR_TYPE, + LANGUAGES, + LICENSES, + MODALITIES, + SAMPLE_CREATION_METHOD, + STR_DATE, + TASK_DOMAIN, + TASK_SUBTYPE, + TASK_TYPE, + HFSubset, + TaskMetadata, +) +from mteb.languages import ISO_LANGUAGE_SCRIPT + +logger = logging.getLogger(__name__) + + +class AggregateTaskMetadata(TaskMetadata): + """Metadata for an 
aggregation of tasks. This description only covers exceptions to the TaskMetadata. Many of the field if not filled out will be + autofilled from its tasks. + + Attributes: + name: The name of the aggregated task. + description: A description of the task. Should explain the aggregation. + prompt: An aggregate task does not have a prompt, thus this value is always None. + dataset: The dataset for the aggregated task is specified in its tasks. The aggregate task thus only specified the revision and uses a + placeholder path. + tasks: A list of tasks, the majority of the metadata is described within its tasks. + eval_splits: The splits of the tasks used for evaluation. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + name: str + description: str + dataset: dict[str, Any] = { + "path": "aggregate tasks do not have a path", # just a place holder + "revision": "1", + } + + tasks: list[AbsTask] + main_score: str + type: TASK_TYPE + eval_splits: list[str] + eval_langs: LANGUAGES = [] + prompt: None = None + reference: str | None = None + bibtex_citation: str | None = None + + @property + def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISO_LANGUAGE_SCRIPT]]: + """Return a dictionary mapping huggingface subsets to languages.""" + return {"default": self.eval_langs} # type: ignore + + @model_validator(mode="after") # type: ignore + def compute_unfilled_cases(self) -> AggregateTaskMetadata: + if not self.eval_langs: + self.eval_langs = self.compute_eval_langs() + if not self.date: + self.date = self.compute_date() + if not self.domains: + self.domains = self.compute_domains() + if not self.task_subtypes: + self.task_subtypes = self.compute_task_subtypes() + if not self.license: + self.license = self.compute_license() + if not self.annotations_creators: + self.annotations_creators = self.compute_annotations_creators() + if not self.dialect: + self.dialect = self.compute_dialect() + if not self.sample_creation: + self.sample_creation = 
self.compute_sample_creation() + if not self.modalities: + self.modalities = self.compute_modalities() + + return self + + def compute_eval_langs(self) -> list[ISO_LANGUAGE_SCRIPT]: + langs = set() + for task in self.tasks: + langs.update(set(task.metadata.bcp47_codes)) + return list(langs) + + def compute_date(self) -> tuple[STR_DATE, STR_DATE] | None: + # get min max date from tasks + dates = [] + for task in self.tasks: + if task.metadata.date: + dates.append(datetime.fromisoformat(task.metadata.date[0])) + dates.append(datetime.fromisoformat(task.metadata.date[1])) + + if not dates: + return None + + min_date = min(dates) + max_date = max(dates) + return min_date.isoformat(), max_date.isoformat() + + def compute_domains(self) -> list[TASK_DOMAIN] | None: + domains = set() + for task in self.tasks: + if task.metadata.domains: + domains.update(set(task.metadata.domains)) + if domains: + return list(domains) + return None + + def compute_task_subtypes(self) -> list[TASK_SUBTYPE] | None: + subtypes = set() + for task in self.tasks: + if task.metadata.task_subtypes: + subtypes.update(set(task.metadata.task_subtypes)) + if subtypes: + return list(subtypes) + return None + + def compute_license(self) -> LICENSES | None: + licenses = set() + for task in self.tasks: + if task.metadata.license: + licenses.add(task.metadata.license) + if len(licenses) > 1: + return "multiple" + return None + + def compute_annotations_creators(self) -> ANNOTATOR_TYPE | None: + creators = set() + for task in self.tasks: + if task.metadata.annotations_creators: + creators.add(task.metadata.annotations_creators) + if len(creators) > 1: + logger.warning( + f"Multiple annotations_creators found for tasks in {self.name}. Using None as annotations_creators." 
+ ) + return None + + def compute_dialect(self) -> list[str] | None: + dialects = set() + for task in self.tasks: + if task.metadata.dialect: + dialects.update(set(task.metadata.dialect)) + if dialects: + return list(dialects) + return None + + def compute_sample_creation(self) -> SAMPLE_CREATION_METHOD | None: + sample_creations = set() + for task in self.tasks: + if task.metadata.sample_creation: + sample_creations.add(task.metadata.sample_creation) + if len(sample_creations) > 1: + return "multiple" + return None + + def compute_modalities(self) -> list[MODALITIES]: + modalities = set() + for task in self.tasks: + if task.metadata.modalities: + modalities.update(set(task.metadata.modalities)) + if modalities: + return list(modalities) + return None diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py new file mode 100644 index 0000000000..255df2000f --- /dev/null +++ b/mteb/abstasks/aggregated_task.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +import numpy as np + +from mteb.abstasks.AbsTask import AbsTask +from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata + +if TYPE_CHECKING: + from datasets import Dataset, DatasetDict + + from mteb.abstasks.TaskMetadata import DescriptiveStatistics, HFSubset + from mteb.encoder_interface import Encoder + from mteb.load_results.task_results import TaskResult + + from .AbsTask import ScoresDict + +logger = logging.getLogger(__name__) + + +class AbsTaskAggregate(AbsTask): + metadata: AggregateTaskMetadata + superseded_by: None | str = None + hf_subset = "default" # since there is no subset we use the "default" naming scheme + _eval_splits: list[str] | None = None + + def __init__(self, **kwargs: Any): + self.tasks = self.metadata.tasks + self.taskname_to_task = {task.metadata.name: task for task in self.tasks} + + def task_results_to_scores( + self, task_results: list[TaskResult] + ) -> dict[str, dict[HFSubset, 
ScoresDict]]: + """The function that aggregated scores. Can be redefined to allow for custom aggregations.""" + scores = {} + for split in self.metadata.eval_splits: + main_scores = [] + for task_res in task_results: + main_scores.append( + task_res.get_score_fast( + languages=None, + splits=self.metadata.eval_splits, + ) + ) + main_score = np.mean(main_scores) + scores[split] = { + "default": { + self.metadata.main_score: main_score, + "main_score": main_score, + } + } + return scores + + def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult: + """Combined the task results for using `task_results_to_scores`. Do not redefine this function if you want to implement a custom aggregation. + Instead redefin `task_results_to_scores`. + """ + from mteb.load_results.task_results import ( + TaskResult, # to prevent circular imports, # TODO: can potentially likely be out of function in in v2.0.0 + ) + + eval_times = [tr.evaluation_time for tr in task_results if tr.evaluation_time] + if len(eval_times) != len(task_results): + logger.info( + f"Loaded results does not include runtime. Therefor evaluation of {self.metadata.name} " + + "can't be computed. Setting it to None." + ) + eval_time = np.nan + else: + eval_time = sum(eval_times) + + kg_co2_emissions_ = [ + tr.kg_co2_emissions for tr in task_results if tr.kg_co2_emissions + ] + if len(kg_co2_emissions_) != len(task_results): + logger.info( + f"Loaded results does not include co2-eq emissions. Therefor evaluation of {self.metadata.name} " + + "can't be computed. Setting it to None." 
+ ) + kg_co2_emissions = np.nan + else: + kg_co2_emissions = sum(kg_co2_emissions_) + + task_res = TaskResult.from_task_results( + self, + scores=self.task_results_to_scores(task_results), + evaluation_time=eval_time, + kg_co2_emissions=kg_co2_emissions, + ) + mteb_versions = {tr.mteb_version for tr in task_results} + if len(mteb_versions) != 1: + logger.warning( + f"All tasks of {self.metadata.name} is not run using the same version." + ) + task_res.mteb_version = None + task_res.mteb_version = task_results[0].mteb_version + return task_res + + def check_if_dataset_is_superseded(self): + """Check if the dataset is superseded by a newer version""" + if self.superseded_by: + logger.warning( + f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset." + ) + + def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTaskAggregate: + """Filter the evaluation splits of the task.""" + self._eval_splits = eval_splits + return self + + def evaluate( + self, + model: Encoder, + split: str = "test", + subsets_to_run: list[HFSubset] | None = None, + *, + encode_kwargs: dict[str, Any] = {}, + **kwargs: Any, + ) -> dict[HFSubset, ScoresDict]: + # TODO: If we refactor the runner to at least have a subfunction mteb.run_task(model, task) we could use that here + raise NotImplementedError( + "Aggregate tasks can't be evaluated directly. Instead run it using the MTEB class." + ) + + def _evaluate_subset( + self, + model: Encoder, + data_split: DatasetDict | Dataset, + encode_kwargs: dict[str, Any], + **kwargs: Any, + ) -> ScoresDict: + raise NotImplementedError( + "Aggregate tasks does not implement a _evaluate_subset. Instead use the individual tasks." 
+ ) + + def _calculate_metrics_from_split( + self, split: str, hf_subset: str | None = None, compute_overall: bool = False + ) -> DescriptiveStatistics: + raise NotImplementedError( + "Aggregate tasks does not implement a _calculate_metrics_from_split. Instead use the individual tasks." + ) + + @property + def eval_splits(self) -> list[str]: + if self._eval_splits: + return self._eval_splits + return self.metadata.eval_splits diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 233c7a79b3..600981a77d 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -154,18 +154,7 @@ def load_results( "Banking77Classification", "BiorxivClusteringP2P", "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", + "CQADupstackRetrieval", "ClimateFEVER", "DBPedia", "EmotionClassification", @@ -432,13 +421,12 @@ def load_results( ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmรฅl and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", - citation="""@misc{enevoldsen2024scandinavian, - title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, - author={Kenneth Enevoldsen and Mรกrton Kardos and Niklas Muennighoff and Kristoffer Laigaard Nielbo}, - year={2024}, - eprint={2406.02396}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + citation="""@inproceedings{enevoldsen2024scandinavian, + title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text 
Embedding}, + author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, + booktitle={Advances in Neural Information Processing Systems}, + year={2024}, + url={https://nips.cc/virtual/2024/poster/97869} }""", contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) @@ -1153,6 +1141,30 @@ def load_results( }""", ) + +CODE_RAG = Benchmark( + name="CodeRAG", + tasks=get_tasks( + tasks=[ + "CodeRAGLibraryDocumentationSolutions", + "CodeRAGOnlineTutorials", + "CodeRAGProgrammingSolutions", + "CodeRAGStackoverflowPosts", + ], + ), + description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.", + reference="https://arxiv.org/abs/2406.14497", + citation="""@misc{wang2024coderagbenchretrievalaugmentcode, + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. 
Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, + }""", +) + NANOBEIR = Benchmark( name="NanoBEIR", tasks=get_tasks( @@ -1232,3 +1244,126 @@ def load_results( primaryClass={cs.CL} }""", ) + +FA_MTEB = Benchmark( + name="FaMTEB(fas, beta)", + tasks=get_tasks( + languages=["fas"], + tasks=[ + # Classification + "PersianFoodSentimentClassification", + "SynPerChatbotConvSAClassification", + "SynPerChatbotConvSAToneChatbotClassification", + "SynPerChatbotConvSAToneUserClassification", + "SynPerChatbotSatisfactionLevelClassification", + "SynPerChatbotRAGToneChatbotClassification", + "SynPerChatbotRAGToneUserClassification", + "SynPerChatbotToneChatbotClassification", + "SynPerChatbotToneUserClassification", + "PersianTextTone", + "SIDClassification", + "DeepSentiPers", + "PersianTextEmotion", + "SentimentDKSF", + "NLPTwitterAnalysisClassification", + "DigikalamagClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + # Clustering + "BeytooteClustering", + "DigikalamagClustering", + "HamshahriClustring", + "NLPTwitterAnalysisClustering", + "SIDClustring", + # PairClassification + "FarsTail", + "CExaPPC", + "SynPerChatbotRAGFAQPC", + "FarsiParaphraseDetection", + "SynPerTextKeywordsPC", + "SynPerQAPC", + "ParsinluEntail", + "ParsinluQueryParaphPC", + # Reranking + "MIRACLReranking", + "WikipediaRerankingMultilingual", + # Retrieval + "SynPerQARetrieval", + "SynPerChatbotTopicsRetrieval", + "SynPerChatbotRAGTopicsRetrieval", + "SynPerChatbotRAGFAQRetrieval", + "PersianWebDocumentRetrieval", + "WikipediaRetrievalMultilingual", + "MIRACLRetrieval", + "ClimateFEVER-Fa", + "DBPedia-Fa", + "HotpotQA-Fa", + "MSMARCO-Fa", + "NQ-Fa", + "ArguAna-Fa", + "CQADupstackRetrieval-Fa", + "FiQA2018-Fa", + "NFCorpus-Fa", + "QuoraRetrieval-Fa", + "SCIDOCS-Fa", + "SciFact-Fa", + "TRECCOVID-Fa", + "Touche2020-Fa", + # STS + 
"Farsick", + "SynPerSTS", + "Query2Query", + # SummaryRetrieval + "SAMSumFa", + "SynPerChatbotSumSRetrieval", + "SynPerChatbotRAGSumSRetrieval", + ], + ), + description="Main Persian (Farsi) benchmarks from MTEB", + reference=None, + citation=None, + contacts=["mehran-sarmadi", "ERfun", "morteza20"], +) + +CHEMTEB = Benchmark( + name="ChemTEB", + tasks=get_tasks( + tasks=[ + "PubChemSMILESBitextMining", + "SDSEyeProtectionClassification", + "SDSGlovesClassification", + "WikipediaBioMetChemClassification", + "WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCryobiologySeparationClassification", + "WikipediaChemistryTopicsClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaChemFieldsClassification", + "WikipediaLuminescenceClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaSaltsSemiconductorsClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaCompChemSpectroscopyClassification", + "WikipediaChemEngSpecialtiesClassification", + "WikipediaChemistryTopicsClustering", + "WikipediaSpecialtiesInChemistryClustering", + "PubChemAISentenceParaphrasePC", + "PubChemSMILESPC", + "PubChemSynonymPC", + "PubChemWikiParagraphsPC", + "PubChemWikiPairClassification", + "ChemNQRetrieval", + "ChemHotpotQARetrieval", + ], + ), + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + citation="""@article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} 
+}""", +) diff --git a/mteb/create_meta.py b/mteb/create_meta.py index e810751a08..ea4bf9c952 100644 --- a/mteb/create_meta.py +++ b/mteb/create_meta.py @@ -8,7 +8,6 @@ import mteb from mteb import TaskResult -from mteb.load_results.task_results import CQADupstackRetrievalDummy def generate_readme(results_folder: Path, from_existing: Path | None = None) -> str: @@ -46,12 +45,7 @@ def load_model_name(results_folder: Path) -> str: def process_task_result(task_result: TaskResult) -> list[dict[str, Any]]: - # CQADupstackRetrieval is a combined dataset (special case atm.) - task = ( - CQADupstackRetrievalDummy() - if task_result.task_name == "CQADupstackRetrieval" - else mteb.get_task(task_result.task_name) - ) + task = mteb.get_task(task_result.task_name) yaml_results = [] for split, hf_subset_scores in task_result.scores.items(): diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 70378931c2..bcef789174 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -4,7 +4,7 @@ import logging import os import traceback -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from copy import copy, deepcopy from datetime import datetime from itertools import chain @@ -16,6 +16,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import ScoresDict +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers @@ -31,9 +32,12 @@ class MTEB: + _tasks: Iterable[str | AbsTask] | None + tasks: list[AbsTask] + def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Sequence[str | AbsTask] | None = None, *, task_types: list[str] | None = None, task_categories: list[str] | None = None, @@ -61,12 +65,11 @@ def __init__( self.deprecation_warning( task_types, task_categories, task_langs, tasks, version ) - if tasks is not 
None: self._tasks = tasks if isinstance(tasks[0], Benchmark): self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) + self._tasks = self._tasks = list(chain.from_iterable(tasks)) # type: ignore assert ( task_types is None and task_categories is None ), "Cannot specify both `tasks` and `task_types`/`task_categories`" @@ -253,7 +256,7 @@ def select_tasks(self, **kwargs): f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." ) # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) + self.tasks.extend([x for x in self._tasks if isinstance(x, (AbsTask))]) return # Otherwise use filters to select tasks @@ -463,6 +466,29 @@ def run( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) + if isinstance(task, AbsTaskAggregate): + self_ = MTEB(tasks=task.metadata.tasks) + task_results = self_.run( + model, + verbosity=verbosity - 1, + output_folder=output_folder, + eval_splits=eval_splits, + eval_subsets=eval_subsets, + overwrite_results=overwrite_results, + raise_error=raise_error, + co2_tracker=co2_tracker, + encode_kwargs=encode_kwargs, + **kwargs, + ) + new_results = task.combine_task_results(task_results) + evaluation_results.append(new_results) + + if output_path: + save_path = output_path / f"{task.metadata.name}.json" + new_results.to_disk(save_path) + del self.tasks[0] + continue + if "bm25s" in meta.name and task.metadata.type != "Retrieval": logger.warning( f"bm25s only supports Retrieval tasks, but the task type is {task.metadata.type}. Skipping task." 
@@ -473,7 +499,11 @@ def run( task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits ) - task_subsets = list(task.metadata.hf_subsets_to_langscripts.keys()) + task_subsets = ( + task.hf_subsets + if task.hf_subsets + else list(task.metadata.hf_subsets_to_langscripts.keys()) + ) existing_results = None save_path = None diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 5ee5a6b9da..94bea19a83 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -5,6 +5,7 @@ import logging import tempfile import time +import typing from pathlib import Path from typing import Literal from urllib.parse import urlencode @@ -14,21 +15,41 @@ from gradio_rangeslider import RangeSlider import mteb +from mteb.abstasks.TaskMetadata import TASK_TYPE from mteb.caching import json_cache from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables logger = logging.getLogger(__name__) +acknowledgment_md = """ +### Acknowledgment +We thank [ServiceNow](https://www.servicenow.com/), [Contextual AI](https://contextual.ai/) and [Hugging Face](https://huggingface.co/) for their generous sponsorship. If you'd like to sponsor us, please get in [touch](mailto:n.muennighoff@gmail.com). 
+ + + +We also thank the following companies which provide API credits to evaluate their models: [OpenAI](https://openai.com/), [Voyage AI](https://www.voyageai.com/) +""" + +ALL_MODELS = {meta.name for meta in mteb.get_model_metas()} + def load_results(): results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") if not results_cache_path.exists(): - all_results = ( - mteb.load_results(only_main_score=True, require_model_meta=False) - .join_revisions() - .filter_models() - ) + all_results = mteb.load_results( + only_main_score=True, require_model_meta=False, models=ALL_MODELS + ).filter_models() all_results.to_disk(results_cache_path) return all_results else: @@ -168,7 +189,7 @@ def filter_models( benchmarks = mteb.get_benchmarks() all_benchmark_results = { - benchmark.name: benchmark.load_results(base_results=all_results) + benchmark.name: benchmark.load_results(base_results=all_results).join_revisions() for benchmark in benchmarks } default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) @@ -206,7 +227,7 @@ def filter_models( ) type_select = gr.Dropdown( all_results.task_types, - value=sorted(default_results.task_types), + value=sorted(typing.get_args(TASK_TYPE)), multiselect=True, label="Task Type", info="Select task types to include.", @@ -232,6 +253,12 @@ def filter_models( """ with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + gr.Markdown(""" + ## MMTEB: Massive Multilingual Text Embedding Benchmark + + The MMTEB leaderboard compares text embedding models on 1000+ languages. Check out the [paper](https://openreview.net/pdf?id=zl3pfz4VCV) for details on datasets, languages and tasks. And you can contribute! ๐Ÿค— To add a model, please refer to the documentation in the [GitHub repository](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md). 
Also check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) โš”๏ธ + """) + with gr.Row(): with gr.Column(scale=5): gr.Markdown( @@ -632,6 +659,7 @@ def update_tables( outputs=[summary_table, per_task_table], ) + gr.Markdown(acknowledgment_md, elem_id="ack_markdown") if __name__ == "__main__": demo.launch() diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index e1632a3dec..caece1b2b4 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -3,7 +3,7 @@ import json import warnings from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from pathlib import Path from typing import Any, Callable, Literal @@ -69,7 +69,7 @@ def filter_tasks( task_results=new_task_results, ) - def select_tasks(self, tasks: list[AbsTask]) -> ModelResult: + def select_tasks(self, tasks: Sequence[AbsTask]) -> ModelResult: task_name_to_task = {task.metadata.name: task for task in tasks} new_task_results = [ task_res.validate_and_filter_scores(task_name_to_task[task_res.task_name]) @@ -105,15 +105,15 @@ def get_scores( try: if use_fast: scores[res.task_name] = res.get_score_fast( - splits=splits, - languages=languages, + splits=splits, # type: ignore + languages=languages, # type: ignore ) else: scores[res.task_name] = res.get_score( splits=splits, languages=languages, - aggregation=aggregation, - getter=getter, + aggregation=aggregation, # type: ignore + getter=getter, # type: ignore scripts=scripts, ) except Exception as e: @@ -216,7 +216,7 @@ def filter_tasks( model_results=[res for res in model_results if res.task_results] ) - def select_tasks(self, tasks: list[AbsTask]) -> BenchmarkResults: + def select_tasks(self, tasks: Sequence[AbsTask]) -> BenchmarkResults: new_model_results = [ model_res.select_tasks(tasks) for model_res in self.model_results ] @@ -259,6 +259,8 @@ def parse_version(version_str: str) -> Version | None: 
return None def keep_best(group: pd.DataFrame) -> pd.DataFrame: + # Filtering out task_results where no scores are present + group = group[group["has_scores"]] is_main_revision = group["revision"] == group["main_revision"] # If the main revision is present we select that if is_main_revision.sum() > 0: @@ -286,6 +288,7 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: task_name=task_result.task_name, mteb_version=task_result.mteb_version, task_result=task_result, + has_scores=bool(task_result.scores), ) ) task_df = pd.DataFrame.from_records(records) @@ -314,8 +317,8 @@ def get_scores( splits: list[Split] | None = None, languages: list[ISO_LANGUAGE | ISO_LANGUAGE_SCRIPT] | None = None, scripts: list[ISO_LANGUAGE_SCRIPT] | None = None, - getter: Callable[[ScoresDict], Score] = None, - aggregation: Callable[[list[Score]], Any] = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, format: Literal["wide", "long"] = "wide", ) -> list[dict]: entries = [] @@ -390,7 +393,7 @@ def to_dict(self) -> dict: return self.model_dump() @classmethod - def from_dict(cls, data: dict) -> TaskResult: + def from_dict(cls, data: dict) -> BenchmarkResults: return cls.model_validate(data) def to_disk(self, path: Path | str) -> None: diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 72cae5a93d..4ff2406934 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -4,6 +4,7 @@ import logging from argparse import Namespace from collections import defaultdict +from collections.abc import Iterable from functools import cached_property from importlib.metadata import version from pathlib import Path @@ -23,24 +24,6 @@ logger = logging.getLogger(__name__) -# Tasks that were completely removed from the MTEB (we generally don't do this anymore instead we supersede tasks) -class CQADupstackRetrievalDummy: - """A dummy task for loading historic results from before 
v1.11.0""" - - metadata = Namespace( # type: ignore - name="CQADupstackRetrieval", - main_score="ndcg_at_10", - type="Retrieval", - hf_subsets_to_langscripts={ - "default": ["eng-Latn"], - }, - dataset={ - "revision": "revision not applicable", - "path": "CQADupstackRetrieval_is_a_combined_dataset", - }, - ) - - class ScalaNbClassificationDummy: """A dummy task for loading historic results from before v1.11.0""" @@ -52,6 +35,7 @@ class ScalaNbClassificationDummy: "default": ["nob-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -66,6 +50,7 @@ class ScalaNnClassificationDummy: "default": ["nno-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -80,6 +65,7 @@ class ScalaDaClassificationDummy: "default": ["dan-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -94,11 +80,11 @@ class ScalaSvClassificationDummy: "default": ["swe-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) outdated_tasks = { - "CQADupstackRetrieval": CQADupstackRetrievalDummy, "ScalaNbClassification": ScalaNbClassificationDummy, "ScalaNnClassification": ScalaNnClassificationDummy, "ScalaDaClassification": ScalaDaClassificationDummy, @@ -183,7 +169,7 @@ def from_task_results( flat_scores[split].append(_scores) return TaskResult( - dataset_revision=task.metadata.dataset["revision"], + dataset_revision=task.metadata.revision, task_name=task.metadata.name, mteb_version=version("mteb"), scores=flat_scores, @@ -471,10 +457,12 @@ def get_score( return aggregation(values) - def get_score_fast(self, splits: str | None, languages: str | None) -> float: + def get_score_fast( + self, splits: Iterable[str] | None = None, languages: str | None = None + ) -> float: """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.""" if splits is None: - splits = 
self.scores + splits = self.scores.keys() val_sum = 0 n_val = 0 for split in splits: @@ -536,14 +524,11 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult: if task is None: task = get_task(self.task_name) + splits = task.metadata.eval_splits - if task.is_multilingual: - hf_subsets = getattr( - task, "hf_subsets", task.metadata.hf_subsets_to_langscripts.keys() - ) - hf_subsets = set(hf_subsets) - else: - hf_subsets = {"default"} + hf_subsets = task.hf_subsets + hf_subsets = set(hf_subsets) + new_scores = {} seen_splits = set() for split in self.scores: diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index f765b01bff..e92c1ca098 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -110,7 +110,8 @@ # in MTEB "NQ": ["test"], "NQHardNegatives": ["test"], - "HotPotQA": ["test"], + "NQ-PL": ["test"], + "HotPotQA": ["test"], # translated, not trained on "HotPotQAHardNegatives": ["test"], "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) "FEVER": ["test"], diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py new file mode 100644 index 0000000000..4616209df1 --- /dev/null +++ b/mteb/models/bedrock_models.py @@ -0,0 +1,264 @@ +from __future__ import annotations + +import json +import logging +import re +from functools import partial +from typing import Any + +import numpy as np +import tqdm + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.cohere_models import model_prompts as cohere_model_prompts +from mteb.models.cohere_models import supported_languages as cohere_supported_languages +from mteb.requires_package import requires_package + +from .wrapper import Wrapper + +logger = logging.getLogger(__name__) + + +class BedrockWrapper(Wrapper): + def __init__( + self, + model_id: str, + provider: str, + max_tokens: int, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + 
requires_package(self, "boto3", "The AWS SDK for Python") + import boto3 + + boto3_session = boto3.session.Session() + region_name = boto3_session.region_name + self._client = boto3.client("bedrock-runtime", region_name) + + self._model_id = model_id + self._provider = provider.lower() + + if self._provider == "cohere": + self.model_prompts = ( + self.validate_task_to_prompt_name(model_prompts) + if model_prompts + else None + ) + self._max_batch_size = 96 + self._max_sequence_length = max_tokens * 4 + else: + self._max_tokens = max_tokens + + def encode( + self, + sentences: list[str], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + requires_package(self, "boto3", "Amazon Bedrock") + show_progress_bar = ( + False + if "show_progress_bar" not in kwargs + else kwargs.pop("show_progress_bar") + ) + if self._provider == "amazon": + return self._encode_amazon(sentences, show_progress_bar) + elif self._provider == "cohere": + prompt_name = self.get_prompt_name( + self.model_prompts, task_name, prompt_type + ) + cohere_task_type = self.model_prompts.get(prompt_name, "search_document") + return self._encode_cohere(sentences, cohere_task_type, show_progress_bar) + else: + raise ValueError( + f"Unknown provider '{self._provider}'. Must be 'amazon' or 'cohere'." 
+ ) + + def _encode_amazon( + self, sentences: list[str], show_progress_bar: bool = False + ) -> np.ndarray: + from botocore.exceptions import ValidationError + + all_embeddings = [] + # https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html + max_sequence_length = int(self._max_tokens * 4.5) + + for sentence in tqdm.tqdm( + sentences, leave=False, disable=not show_progress_bar + ): + if len(sentence) > max_sequence_length: + truncated_sentence = sentence[:max_sequence_length] + else: + truncated_sentence = sentence + + try: + embedding = self._embed_amazon(truncated_sentence) + all_embeddings.append(embedding) + + except ValidationError as e: + error_str = str(e) + pattern = r"request input token count:\s*(\d+)" + match = re.search(pattern, error_str) + if match: + num_tokens = int(match.group(1)) + + ratio = 0.9 * (self._max_tokens / num_tokens) + dynamic_cutoff = int(len(truncated_sentence) * ratio) + + embedding = self._embed_amazon(truncated_sentence[:dynamic_cutoff]) + all_embeddings.append(embedding) + else: + raise e + + return np.array(all_embeddings) + + def _encode_cohere( + self, + sentences: list[str], + cohere_task_type: str, + show_progress_bar: bool = False, + ) -> np.ndarray: + batches = [ + sentences[i : i + self._max_batch_size] + for i in range(0, len(sentences), self._max_batch_size) + ] + + all_embeddings = [] + + for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): + response = self._client.invoke_model( + body=json.dumps( + { + "texts": [sent[: self._max_sequence_length] for sent in batch], + "input_type": cohere_task_type, + } + ), + modelId=self._model_id, + accept="*/*", + contentType="application/json", + ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) + + def _embed_amazon(self, sentence: str) -> np.ndarray: + response = self._client.invoke_model( + body=json.dumps({"inputText": sentence}), + modelId=self._model_id, + accept="application/json", + 
contentType="application/json", + ) + return self._to_numpy(response) + + def _to_numpy(self, embedding_response) -> np.ndarray: + response = json.loads(embedding_response.get("body").read()) + key = "embedding" if self._provider == "amazon" else "embeddings" + return np.array(response[key]) + + +amazon_titan_embed_text_v1 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v1", + revision="1", + release_date="2023-09-27", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v1", + provider="amazon", + max_tokens=8192, + ), + max_tokens=8192, + embed_dim=1536, + open_weights=False, + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) + +amazon_titan_embed_text_v2 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v2", + revision="1", + release_date="2024-04-30", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v2:0", + provider="amazon", + max_tokens=8192, + ), + max_tokens=8192, + embed_dim=1024, + open_weights=False, + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) +# Note: For the original Cohere API implementation, refer to: +# https://github.com/embeddings-benchmark/mteb/blob/main/mteb/models/cohere_models.py +# This implementation uses the Amazon Bedrock endpoint for Cohere models. 
+cohere_embed_english_v3 = ModelMeta( + loader=partial( + BedrockWrapper, + model_id="cohere.embed-english-v3", + provider="cohere", + max_tokens=512, + model_prompts=cohere_model_prompts, + ), + name="bedrock/cohere-embed-english-v3", + languages=["eng-Latn"], + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", + revision="1", + release_date="2023-11-02", + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + max_tokens=512, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, +) + +cohere_embed_multilingual_v3 = ModelMeta( + loader=partial( + BedrockWrapper, + model_id="cohere.embed-multilingual-v3", + provider="cohere", + max_tokens=512, + model_prompts=cohere_model_prompts, + ), + name="bedrock/cohere-embed-multilingual-v3", + languages=cohere_supported_languages, + open_weights=False, + reference="https://cohere.com/blog/introducing-embed-v3", + revision="1", + release_date="2023-11-02", + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + max_tokens=512, + embed_dim=1024, + license=None, + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, +) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 3eed189d33..3c18f9c27a 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -19,7 +19,6 @@ **E5_TRAINING_DATA, "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 0ad15e7320..94d04ee483 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -130,7 +130,6 @@ **E5_TRAINING_DATA, "FEVER": ["train"], "FEVERHardNegatives": ["train"], - 
"FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 35d0543811..440779787b 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -16,7 +16,6 @@ # also uses medi2 which contains fever and hotpotqa: "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 2ee3a09b56..cc6e814629 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -6,6 +6,7 @@ import numpy as np import torch +from sentence_transformers import SentenceTransformer from mteb.encoder_interface import PromptType @@ -78,3 +79,87 @@ def encode( return embeddings return InstructWrapper(model_name_or_path, mode, instruction_template, **kwargs) + + +class InstructSentenceTransformerWrapper(Wrapper): + def __init__( + self, + model_name: str, + revision: str, + instruction_template: str | Callable[[str], str] | None = None, + max_seq_length: int | None = None, + apply_instruction_to_passages: bool = True, + padding_side: str | None = None, + add_eos_token: bool = False, + **kwargs: Any, + ): + """Instruct Sentence Transformer Wrapper. Wrapper that passes instructions to the Sentence Transformer model. + Applied for models like NV-Embed, gte-Qwen, e5-mistral, etc. + + Arguments: + model_name: Model name of the sentence transformers model. + revision: Revision of the sentence transformers model. + instruction_template: Model template. Should contain the string '{instruction}'. + max_seq_length: Maximum sequence length. If None, the maximum sequence length will be read from the model config. 
+ apply_instruction_to_passages: Whether to apply the instruction template to the passages. + padding_side: Padding side. If None, the padding side will be read from the model config. + add_eos_token: Whether to add the eos token to each input example. + **kwargs: Kwargs for Sentence Transformer model. + """ + if ( + isinstance(instruction_template, str) + and "{instruction}" not in instruction_template + ): + raise ValueError( + "Instruction template must contain the string '{instruction}'." + ) + if instruction_template is None: + logger.warning( + "No instruction template provided. Instructions will be used as-is." + ) + + self.model_name = model_name + self.model = SentenceTransformer(model_name, revision=revision, **kwargs) + self.instruction_template = instruction_template + self.apply_instruction_to_passages = apply_instruction_to_passages + self.add_eos_token = add_eos_token + if max_seq_length is not None: + self.model.max_seq_length = max_seq_length + if padding_side is not None: + self.model.tokenizer.padding_side = padding_side + + def encode( + self, + sentences: Sequence[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + if self.add_eos_token: + sentences = [ + example + self.model.tokenizer.eos_token for example in sentences + ] + + instruction = self.get_task_instruction(task_name, prompt_type) + + # to passage prompts won't be applied to passages + if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + instruction = None + logger.info( + f"No instruction used, because prompt type = {prompt_type.passage}" + ) + + if instruction: + logger.info(f"Using instruction: '{instruction}' for task: '{task_name}'") + + embeddings = self.model.encode( + sentences, + prompt=instruction, + **kwargs, + ) + + if isinstance(embeddings, torch.Tensor): + # sometimes in kwargs can be return_tensors=True + embeddings = embeddings.cpu().detach().float().numpy() + return embeddings diff 
--git a/mteb/models/instructions.py b/mteb/models/instructions.py deleted file mode 100644 index ef439e42bb..0000000000 --- a/mteb/models/instructions.py +++ /dev/null @@ -1,430 +0,0 @@ -"""This specifies the default instructions for tasks within MTEB. These are optional to use and some models might want to use their own instructions.""" - -from __future__ import annotations - -import mteb - -# Prompts from -# SEB: https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_models/e5_instruct_models.py -# E5: https://github.com/microsoft/unilm/blob/9c0f1ff7ca53431fe47d2637dfe253643d94185b/e5/utils.py#L106 -DEFAULT_PROMPTS = { - "STS": "Retrieve semantically similar text.", - "Summarization": "Given a news summary, retrieve other semantically similar summaries", - "BitextMining": "Retrieve parallel sentences.", - "Classification": "Classify user passages", - "Clustering": "Identify categories in user passages", - "Reranking": "Retrieve text based on user query.", - "Retrieval": "Retrieve text based on user query.", - "InstructionRetrieval": "Retrieve text based on user query.", - "PairClassification": "Retrieve text that are semantically similar to the given text", -} - - -# This list is NOT comprehensive even for the tasks within MTEB -# TODO: We should probably move this prompt to the task object -TASKNAME2INSTRUCTIONS = { - # BitextMining - "BornholmBitextMining": "Retrieve parallel sentences in Danish and Bornholmsk", - "NorwegianCourtsBitextMining ": "Retrieve parallel sentences in Norwegian Bokmรฅl and Nynorsk", - # Classification - "AngryTweetsClassification": "Classify Danish tweets by sentiment. 
(positive, negative, neutral)", - "DKHateClassification": "Classify Danish tweets based on offensiveness (offensive, not offensive)", - "DanishPoliticalCommentsClassification": "Classify Danish political comments for sentiment", - "DalajClassification": "Classify texts based on linguistic acceptability in Swedish", - "LccSentimentClassification": "Classify texts based on sentiment", - "NordicLangClassification": "Classify texts based on language", - "MassiveIntentClassification": "Given a user utterance as query, find the user intents", - "Massive Scenario": "Given a user utterance as query, find the user scenarios", - "NoRecClassification": "Classify Norwegian reviews by sentiment", - "SweRecClassification": "Classify Swedish reviews by sentiment", - "Norwegian parliament": "Classify parliament speeches in Norwegian based on political affiliation", - "ScalaClassification": "Classify passages in Scandinavian Languages based on linguistic acceptability", - "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", - "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", - "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", - "Banking77Classification": "Given a online banking query, find the corresponding intents", - "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", - "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", - "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", - "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", - "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", - 
"ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", - "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", - "TNews": "Classify the fine-grained category of the given news title", - "IFlyTek": "Given an App description text, find the appropriate fine-grained category", - "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative", - "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative", - "OnlineShopping": "Classify the customer review for online shopping into positive or negative", - "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", - "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment", - "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text", - "HeadlineClassification": "Classify the topic or theme of the given news headline", - "CEDRClassification": "Given a comment as query, find expressed emotions (joy, sadness, surprise, fear, and anger)", - "GeoreviewClassification": "Classify the organization rating based on the reviews", - "InappropriatenessClassification": "Classify the given message as either sensitive topic or not", - "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts", - "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts", - "SensitiveTopicsClassification": "Given a sentence as query, find sensitive topics", - # Clustering - "VGHierarchicalClusteringP2P": "Identify the categories (e.g. sports) of given articles in Norwegian", - "VGHierarchicalClusteringS2S": "Identify the categories (e.g. 
sports) of given articles in Norwegian", - "SNLHierarchicalClusteringP2P": "Identify categories in a Norwegian lexicon", - "SNLHierarchicalClusteringS2S": "Identify categories in a Norwegian lexicon", - "SwednClusteringP2P": "Identify news categories in Swedish passages", - "SwednClusteringS2S": "Identify news categories in Swedish passages", - "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", - "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", - "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", - "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", - "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", - "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", - "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", - "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", - "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", - "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", - "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", - "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", - "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", - "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", - "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", - "GeoreviewClusteringP2P": "Identify the organization category based on the reviews", - 
"RuSciBenchOECDClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", - "RuSciBenchGRNTIClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", - # Reranking and pair classification - "AskUbuntuDupQuestions": "Retrieve duplicate questions from AskUbuntu forum", - "MindSmallReranking": "Retrieve relevant news articles based on user browsing history", - "SciDocsRR": "Given a title of a scientific paper, retrieve the titles of other relevant papers", - "StackOverflowDupQuestions": "Retrieve duplicate questions from StackOverflow forum", - "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum", - "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet", - "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet", - "T2Reranking": "Given a Chinese search query, retrieve web passages that answer the question", - "MMarcoReranking": "Given a Chinese search query, retrieve web passages that answer the question", - "VoyageMMarcoReranking": "Given a Japanese search query, retrieve web passages that answer the question", - "CMedQAv1": "Given a Chinese community medical question, retrieve replies that best answer the question", - "CMedQAv2": "Given a Chinese community medical question, retrieve replies that best answer the question", - "Ocnli": "Retrieve semantically similar text.", - "Cmnli": "Retrieve semantically similar text.", - "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise", - "RuBQReranking": ( - "Given a question, retrieve Wikipedia passages that answer the question", - "", - ), - "MIRACLReranking": ( - "Given a question, retrieve Wikipedia passages that answer the question", - "", - ), - # Retrieval - 1st item is query instruction; 2nd is corpus instruction - "TwitterHjerneRetrieval": ( - "Retrieve answers to questions asked in Danish tweets", - "", - ), - "SwednRetrieval": 
( - "Given a Swedish news headline retrieve summaries or news articles", - "", - ), - "TV2Nordretrieval": ( - "Given a summary of a Danish news article retrieve the corresponding news article", - "", - ), - "DanFEVER": ( - "Given a claim in Danish, retrieve documents that support the claim", - "", - ), - "SNLRetrieval": ("Given a lexicon headline in Norwegian, retrieve its article", ""), - "NorQuadRetrieval": ( - "Given a question in Norwegian, retrieve the answer from Wikipedia articles", - "", - ), - "SweFaqRetrieval": ("Retrieve answers given questions in Swedish", ""), - "ArguAna": ("Given a claim, find documents that refute the claim", ""), - "ClimateFEVER": ( - "Given a claim about climate change, retrieve documents that support or refute the claim", - "", - ), - "DBPedia": ( - "Given a query, retrieve relevant entity descriptions from DBPedia", - "", - ), - "FEVER": ("Given a claim, retrieve documents that support or refute the claim", ""), - "FiQA2018": ( - "Given a financial question, retrieve user replies that best answer the question", - "", - ), - "HotpotQA": ( - "Given a multi-hop question, retrieve documents that can help answer the question", - "", - ), - "MSMARCO": ( - "Given a web search query, retrieve relevant passages that answer the query", - "", - ), - "NFCorpus": ( - "Given a question, retrieve relevant documents that best answer the question", - "", - ), - "NQ": ( - "Given a question, retrieve Wikipedia passages that answer the question", - "", - ), - "QuoraRetrieval": ( - "Given a question, retrieve questions that are semantically equivalent to the given question", - "", - ), - "SCIDOCS": ( - "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", - "", - ), - "SciFact": ( - "Given a scientific claim, retrieve documents that support or refute the claim", - "", - ), - "Touche2020": ( - "Given a question, retrieve detailed and persuasive arguments that answer the question", - "", - ), - "TRECCOVID": ( - 
"Given a query on COVID-19, retrieve documents that answer the query", - "", - ), - "T2Retrieval": ( - "Given a Chinese search query, retrieve web passages that answer the question", - "", - ), - "MMarcoRetrieval": ( - "Given a web search query, retrieve relevant passages that answer the query", - "", - ), - "DuRetrieval": ( - "Given a Chinese search query, retrieve web passages that answer the question", - "", - ), - "CovidRetrieval": ( - "Given a question on COVID-19, retrieve news articles that answer the question", - "", - ), - "CmedqaRetrieval": ( - "Given a Chinese community medical question, retrieve replies that best answer the question", - "", - ), - "EcomRetrieval": ( - "Given a user query from an e-commerce website, retrieve description sentences of relevant products", - "", - ), - "MedicalRetrieval": ( - "Given a medical question, retrieve user replies that best answer the question", - "", - ), - "VideoRetrieval": ( - "Given a video search query, retrieve the titles of relevant videos", - "", - ), - "ARCChallenge": ( - "Retrieve the answer to the question.", - "", - ), - "AlphaNLI": ( - "Given the following start and end of a story, retrieve a possible reason that leads to the end.", - "", - ), - "HellaSwag": ( - "Given the following unfinished context, retrieve the most plausible ending to finish it.", - "", - ), - "PIQA": ( - "Given the following goal, retrieve a possible solution.", - "", - ), - "Quail": ( - "Given the following context and question, retrieve the correct answer.", - "", - ), - "SIQA": ( - "Given the following context and question, retrieve the correct answer.", - "", - ), - "RARbCode": ( - "Retrieve the answer for the following coding problem.", - "", - ), - "RARbMath": ( - "Retrieve the answer for the following math problem.", - "", - ), - "SpartQA": ( - "Given the following spatial reasoning question, retrieve the right answer.", - "", - ), - "TempReasonL1": ( - "Given the following question about time, retrieve the correct 
answer.", - "", - ), - "TempReasonL2Pure": ( - "Given the following question, retrieve the correct answer.", - "", - ), - "TempReasonL2Fact": ( - "Given the following question and facts, retrieve the correct answer.", - "", - ), - "TempReasonL2Context": ( - "Given the following question, facts and contexts, retrieve the correct answer.", - "", - ), - "TempReasonL3Pure": ( - "Given the following question, retrieve the correct answer.", - "", - ), - "TempReasonL3Fact": ( - "Given the following question and facts, retrieve the correct answer.", - "", - ), - "TempReasonL3Context": ( - "Given the following question, facts and contexts, retrieve the correct answer.", - "", - ), - "WinoGrande": ( - "Given the following sentence, retrieve an appropriate answer to fill in the missing underscored part.", - "", - ), - "RuBQRetrieval": ( - "Given a question, retrieve Wikipedia passages that answer the question", - "", - ), - "MIRACLRetrieval": ( - "Given a question, retrieve Wikipedia passages that answer the question", - "", - ), - "RiaNewsRetrieval": ("Given a news title, retrieve relevant news article", ""), - # Any2Any Retrieval - "WebQAT2TRetrieval": ( - "Retrieve passages from Wikipedia that provide answers to the following question.", - "", - ), - "NIGHTSI2IRetrieval": ( - "Find a day-to-day image that looks similar to the provided image.", - "", - ), - "VisualNewsT2IRetrieval": ( - "Identify the news-related image in line with the described event.", - "", - ), - "Fashion200kT2IRetrieval": ( - "Based on the following fashion description, retrieve the best matching image.", - "", - ), - "MSCOCOT2IRetrieval": ( - "Identify the image showcasing the described everyday scene.", - "", - ), - "Flickr30kT2IRetrieval": ("Find an image that matches the given caption.", ""), - "VidoreTatdqaRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreArxivQARetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - 
"VidoreDocVQARetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreInfoVQARetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreShiftProjectRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreSyntheticDocQAAIRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreSyntheticDocQAGovernmentReportsRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreSyntheticDocQAHealthcareIndustryRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreSyntheticDocQAEnergyRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VidoreTabfquadRetrieval": ( - "Find a screenshot that relevant to the user's question.", - "", - ), - "VisualNewsI2TRetrieval": ("Find a caption for the news in the given photo.", ""), - "Fashion200kI2TRetrieval": ( - "Based on the following fashion description, retrieve the best matching image.", - "", - ), - "MSCOCOI2TRetrieval": ( - "Find an image caption describing the following everyday image.", - "", - ), - "Flickr30kI2TRetrieval": ( - "Find an image caption describing the following image.", - "", - ), - "WebQAT2ITRetrieval": ("Find a Wikipedia image that answers this question.", ""), - "EDIST2ITRetrieval": ("Identify the news photo for the given caption.", ""), - "OVENIT2TRetrieval": ( - "Retrieve a Wikipedia paragraph that provides an answer to the given query about the image.", - "", - ), - "InfoSeekIT2TRetrieval": ( - "Find a paragraph from Wikipedia that answers my question about this image.", - "", - ), - "ReMuQIT2TRetrieval": ( - "Retrieve a fact-based paragraph that provides an answer to the given query about the image.", - "", - ), - "OKVQAIT2TRetrieval": ( - "Retrieve documents that provide an answer to the question alongside the image.", - "", - ), - "LLaVAIT2TRetrieval": ( 
- "Provide a specific decription of the image along with the following question.", - "", - ), - "FashionIQIT2IRetrieval": ( - "Find a fashion image that aligns with the reference image and style note.", - "", - ), - "CIRRIT2IRetrieval": ( - "Retrieve a day-to-day image that aligns with the modification instructions of the provided image.", - "", - ), - "OVENIT2ITRetrieval": ( - "Retrieve a Wikipedia image-description pair that provides evidence for the question of this image.", - "", - ), - "InfoSeekIT2ITRetrieval": ( - "Find an image and subject description from Wikipedia that answers my question about this image.", - "", - ), - "EncyclopediaVQAIT2ITRetrieval": ( - "Obtain illustrated documents that correspond to the inquiry alongside the provided image.", - "", - ), -} - - -def task_to_instruction(task_name: str, is_query: bool = True) -> str: - if task_name in TASKNAME2INSTRUCTIONS: - if isinstance(TASKNAME2INSTRUCTIONS[task_name], tuple): - return ( - TASKNAME2INSTRUCTIONS[task_name][0] - if is_query - else TASKNAME2INSTRUCTIONS[task_name][1] - ) - return TASKNAME2INSTRUCTIONS[task_name] - - meta = mteb.get_task(task_name).metadata - return DEFAULT_PROMPTS.get(meta.type, "") diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index dbd1615ad8..d0ff4ab681 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -90,8 +90,17 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 - # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - public_training_code=None, - public_training_data=None, + training_datasets={ + # stage 1, 2, 3 + # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + **nvidia_training_datasets, + # fineweb-edu + # 
https://huggingface.co/datasets/sentence-transformers/embedding-training-data + # stage 4 + # BAAI/Infinity-MM + }, + # training logs https://api.wandb.ai/links/dunnzhang0/z8jqoqpb + # more codes https://huggingface.co/NovaSearch/jasper_en_vision_language_v1/commit/da9b77d56c23d9398fa8f93af449102784f74e1d + public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", + public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset", ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index e855ad3c7a..00641e9c89 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -245,6 +245,12 @@ def encode( jina_embeddings_v2_base_en = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embeddings-v2-base-en", + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + trust_remote_code=True, + ), name="jinaai/jina-embeddings-v2-base-en", languages=["eng-Latn"], open_weights=True, @@ -266,6 +272,12 @@ def encode( ) jina_embeddings_v2_small_en = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embeddings-v2-small-en", + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + trust_remote_code=True, + ), name="jinaai/jina-embeddings-v2-small-en", languages=["eng-Latn"], open_weights=True, @@ -287,6 +299,12 @@ def encode( ) jina_embedding_b_en_v1 = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embedding-b-en-v1", + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + trust_remote_code=True, + ), name="jinaai/jina-embedding-b-en-v1", languages=["eng-Latn"], open_weights=True, @@ -308,6 +326,12 @@ def encode( ) jina_embedding_s_en_v1 = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embedding-s-en-v1", + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + 
trust_remote_code=True, + ), name="jinaai/jina-embedding-s-en-v1", languages=["eng-Latn"], open_weights=True, diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py index 2cf055abd4..380724e53e 100644 --- a/mteb/models/lens_models.py +++ b/mteb/models/lens_models.py @@ -2,6 +2,8 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_full_data + lens_d4000 = ModelMeta( loader=None, # TODO: implement this in the future name="yibinlei/LENS-d4000", @@ -17,8 +19,8 @@ framework=["PyTorch"], use_instructions=True, public_training_code=None, - public_training_data=None, - training_datasets=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, max_tokens=32768, ) @@ -37,7 +39,7 @@ framework=["PyTorch"], use_instructions=True, public_training_code=None, - public_training_data=None, - training_datasets=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, max_tokens=32768, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 1997a85274..f3b313356a 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -1,17 +1,11 @@ from __future__ import annotations import logging -from collections.abc import Sequence from functools import partial -from typing import Any - -import numpy as np -import torch -from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta -from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper logger = logging.getLogger(__name__) @@ -22,56 +16,6 @@ def instruction_template( return f"Instruct: {instruction}\nQuery: " if instruction else "" -class NvEmbedWrapper(SentenceTransformerWrapper): - def __init__( - self, - model: str | SentenceTransformer | CrossEncoder, - 
revision: str | None = None, - model_prompts: dict[str, str] | None = None, - **kwargs, - ) -> None: - super().__init__(model, revision, model_prompts, **kwargs) - self.model.max_seq_length = 32768 - self.model.tokenizer.padding_side = "right" - logger.warning( - "Instructions are used in both query and docs, which may cause performance discrepancies from the original implementation." - ) - - def encode( - self, - sentences: Sequence[str], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs: Any, - ) -> np.ndarray: - # Add eos token to each input example - sentences = [example + self.model.tokenizer.eos_token for example in sentences] - - instruction = "" - if prompt_type == PromptType.query: - instruction = self.get_instruction(task_name, prompt_type) - - prompt = instruction_template(instruction) - - if prompt: - logger.info(f"Using {prompt=} for task={task_name} {prompt_type=}") - else: - logger.info(f"No model prompts found for task={task_name} {prompt_type=}") - - logger.info(f"Encoding {len(sentences)} sentences.") - - embeddings = self.model.encode( - sentences, - prompt=prompt, - normalize_embeddings=True, - **kwargs, - ) - if isinstance(embeddings, torch.Tensor): - embeddings = embeddings.cpu().detach().float().numpy() - return embeddings - - nvidia_training_datasets = { # source: https://arxiv.org/pdf/2405.17428 "ArguAna": ["train"], @@ -120,11 +64,18 @@ def encode( "STSBenchmark": ["train"], "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on } + NV_embed_v2 = ModelMeta( loader=partial( # type: ignore - NvEmbedWrapper, + InstructSentenceTransformerWrapper, model="nvidia/NV-Embed-v2", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template=instruction_template, trust_remote_code=True, + max_seq_length=32768, + padding_side="right", + # for nv-embed, we add eos token to each input example + add_eos_token=True, ), name="nvidia/NV-Embed-v2", languages=["eng_Latn"], @@ -146,9 +97,15 @@ def 
encode( NV_embed_v1 = ModelMeta( loader=partial( # type: ignore - NvEmbedWrapper, + InstructSentenceTransformerWrapper, model="nvidia/NV-Embed-v1", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template=instruction_template, trust_remote_code=True, + max_seq_length=32768, + padding_side="right", + # for nv-embed, we add eos token to each input example + add_eos_token=True, ), name="nvidia/NV-Embed-v1", languages=["eng_Latn"], diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 7535c8939a..9137da2a79 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -14,6 +14,7 @@ from mteb.models import ( align_models, arctic_models, + bedrock_models, bge_models, blip2_models, blip_models, @@ -74,6 +75,7 @@ model_modules = [ align_models, arctic_models, + bedrock_models, bge_models, blip2_models, blip_models, diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 235057a6f8..8c72265cc9 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -22,7 +22,6 @@ def instruction_template( "FiQA2018-PL": ["train"], "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 92d5db7c8a..9cc45a6e02 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,8 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, - # will be at https://github.com/NLPJCL/RAG-Retrieval - public_training_code=None, + public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", public_training_data=None, ) @@ -56,9 +55,8 @@ 
similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", - # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, - public_training_code=None, + public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", public_training_data=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index a637dee36a..3bcfb997bf 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -361,17 +361,17 @@ def _batched_encode( voyage_3_exp = ModelMeta( name="voyageai/voyage-3-m-exp", revision="1", - release_date=None, # not released - languages=None, # supported languages not specified + release_date="2025-01-08", + languages=["eng-Latn"], loader=partial( VoyageWrapper, model_name="voyage-3-m-exp", model_prompts=model_prompts, ), max_tokens=32000, - embed_dim=512, + embed_dim=2048, open_weights=False, - n_parameters=None, + n_parameters=int(6918 * 1e6), license=None, reference="https://huggingface.co/voyageai/voyage-3-m-exp", similarity_fn_name="cosine", diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index c176077215..1cec5d5ddc 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from .dan.BornholmskBitextMining import * +from .eng.PubChemSMILESBitextMining import * from .kat.TbilisiCityHallBitextMining import * from .multilingual.BibleNLPBitextMining import * from .multilingual.BUCCBitextMining import * diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py new file mode 100644 index 0000000000..4951d8c596 --- /dev/null +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -0,0 +1,68 @@ +from __future__ import 
annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +COL_MAPPING = { + "iso-title": {"title": "sentence1", "isomeric_smiles": "sentence2"}, + "iso-desc": {"description": "sentence1", "isomeric_smiles": "sentence2"}, + "canon-title": {"title": "sentence1", "canonical_smiles": "sentence2"}, + "canon-desc": {"description": "sentence1", "canonical_smiles": "sentence2"}, +} + +EVAL_LANGS = { + "iso-title": ["eng-Latn", "eng-Latn"], + "iso-desc": ["eng-Latn", "eng-Latn"], + "canon-title": ["eng-Latn", "eng-Latn"], + "canon-desc": ["eng-Latn", "eng-Latn"], +} + + +class PubChemSMILESBitextMining(MultilingualTask, AbsTaskBitextMining): + metadata = TaskMetadata( + name="PubChemSMILESBitextMining", + dataset={ + "path": "BASF-AI/PubChemSMILESBitextMining", + "revision": "36700ea628118312ebf2f90ad2353a9a8f188dc9", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=EVAL_LANGS, + main_score="f1", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, 
Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + for subset in self.hf_subsets: + self.dataset[subset] = self.dataset[subset].rename_columns( + COL_MAPPING[subset] + ) diff --git a/mteb/tasks/BitextMining/eng/__init__.py b/mteb/tasks/BitextMining/eng/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 3e80ae2181..b2aab22714 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -33,13 +33,31 @@ from .eng.NewsClassification import * from .eng.PatentClassification import * from .eng.PoemSentimentClassification import * +from .eng.SDSEyeProtectionClassification import * +from .eng.SDSGlovesClassification import * from .eng.ToxicChatClassification import * from .eng.ToxicConversationsClassification import * from .eng.TweetSentimentExtractionClassification import * from .eng.TweetTopicSingleClassification import * +from .eng.WikipediaBiolumNeurochemClassification import * +from .eng.WikipediaBioMetChemClassification import * +from .eng.WikipediaChemEngSpecialtiesClassification import * +from .eng.WikipediaChemFieldsClassification import * +from .eng.WikipediaChemistryTopicsClassification import * +from .eng.WikipediaCompChemSpectroscopyClassification import * +from .eng.WikipediaCryobiologySeparationClassification import * +from .eng.WikipediaCrystallographyAnalyticalClassification import * +from .eng.WikipediaGreenhouseEnantiopureClassification import * +from .eng.WikipediaIsotopesFissionClassification import * +from .eng.WikipediaLuminescenceClassification import * +from .eng.WikipediaOrganicInorganicClassification import * +from .eng.WikipediaSaltsSemiconductorsClassification import * 
+from .eng.WikipediaSolidStateColloidalClassification import * +from .eng.WikipediaTheoreticalAppliedClassification import * from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * from .est.estonian_valence import * +from .fas.FaMTEBClassification import * from .fas.PersianFoodSentimentClassification import * from .fil.FilipinoHateSpeechClassification import * from .fil.FilipinoShopeeReviewsClassification import * diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py index 6ddb37c42a..b9abb5445a 100644 --- a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py +++ b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py @@ -22,7 +22,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification): eval_langs=["eng-Latn"], main_score="accuracy", date=("2013-11-01", "2013-11-01"), - domains=["News", "Written"], + domains=["News", "Written", "Financial"], task_subtypes=["Sentiment/Hate speech"], license="cc-by-nc-sa-3.0", annotations_creators="expert-annotated", diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py new file mode 100644 index 0000000000..197060ba0c --- /dev/null +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SDSEyeProtectionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSEyeProtectionClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/SDSEyeProtectionClassification", + "revision": 
"35cbe5ee544dd26e343238a333de4568e6f77819", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @inproceedings{pereira2020msds, + title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + author={Pereira, Eliseu}, + booktitle={15th Doctoral Symposium}, + pages={42}, + year={2020} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py new file mode 100644 index 0000000000..ac471d58e9 --- /dev/null +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SDSGlovesClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SDSGlovesClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/SDSGlovesClassification", + "revision": "c723236c5ec417d79512e6104aca9d2cd88168f6", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + 
main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated and reviewed", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @inproceedings{pereira2020msds, + title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + author={Pereira, Eliseu}, + booktitle={15th Doctoral Symposium}, + pages={42}, + year={2020} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py new file mode 100644 index 0000000000..3b494f46f6 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaBioMetChemClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaBioMetChemClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2GeneExpressionVsMetallurgyClassification", + "revision": "6ac491e5de9070c6dd434b31e76d3d379123dcff", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + 
license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py new file mode 100644 index 0000000000..623ec8fc66 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaBiolumNeurochemClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaBiolumNeurochemClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2BioluminescenceVsNeurochemistryClassification", + "revision": "2f68b7d34c2be896e46b14533573b366e59e5aae", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, 
Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py new file mode 100644 index 0000000000..c95abcd4f2 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemEngSpecialtiesClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemEngSpecialtiesClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium5Classification", + "revision": "f81a76a2fb690e5d5bd7a26dd07e85cdf8405dfb", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py 
b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py new file mode 100644 index 0000000000..7c0179fb1e --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemFieldsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemFieldsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEZ10Classification", + "revision": "a75fae77759acc115f015f2b856baa47776d733d", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py new file mode 100644 index 0000000000..02751b1a32 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from 
mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistryTopicsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaChemistryTopicsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy10Classification", + "revision": "d8fb355db2248f95df8ea410a43aa1db1ee96ba4", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py new file mode 100644 index 0000000000..28a42ac044 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCompChemSpectroscopyClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + 
reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2ComputationalVsSpectroscopistsClassification", + "revision": "474d706a22b0451b5846d623aa4b4234ba5b0513", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py new file mode 100644 index 0000000000..0e01454298 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCryobiologySeparationClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCryobiologySeparationClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy5Classification", + "revision": "858633e882dadd1ec6a0d220f7549bcafd379236", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py new file mode 100644 index 0000000000..724ffc4249 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaCrystallographyAnalyticalClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification", + "revision": "740565a6a853aaed1114a13bdfd5fd46857b4f11", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + 
bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py new file mode 100644 index 0000000000..b701584a70 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaGreenhouseEnantiopureClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaGreenhouseEnantiopureClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2GreenhouseVsEnantiopureClassification", + "revision": "0cfc1a83b6ed832454e8f4f93f7a0e26208274d9", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and 
Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py new file mode 100644 index 0000000000..252ad85ed9 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaIsotopesFissionClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaIsotopesFissionClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification", + "revision": "897743346c7c794264f7dbfadc3978aa2895e8e2", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py new file mode 
100644 index 0000000000..8e115b59d4 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaLuminescenceClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaLuminescenceClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaHard2BioluminescenceVsLuminescenceClassification", + "revision": "21c4dcebe2c5b36a35292e6441e7a10b59bf4896", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py new file mode 100644 index 0000000000..0ad784b69b --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class 
WikipediaOrganicInorganicClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaOrganicInorganicClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2SpecialClassification", + "revision": "96d1d9b37c4693f74c46c83d63a290573f78d511", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py new file mode 100644 index 0000000000..a409f87c8d --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaSaltsSemiconductorsClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": 
"BASF-AI/WikipediaHard2SaltsVsSemiconductorMaterialsClassification", + "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py new file mode 100644 index 0000000000..43f95c50f3 --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaSolidStateColloidalClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaSolidStateColloidalClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy2SolidStateVsColloidalClassification", + "revision": "7d8df44e588b6143d4856c781f72f919fa0599a7", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", 
"2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py new file mode 100644 index 0000000000..f33b02f4bb --- /dev/null +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="WikipediaTheoreticalAppliedClassification", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEZ2Classification", + "revision": "7896906653d31d7102a143d7f55d67cd688e3147", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency 
on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py new file mode 100644 index 0000000000..43c7971429 --- /dev/null +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -0,0 +1,635 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SynPerChatbotConvSAAnger(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAAnger", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Anger", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-anger", + "revision": "5cae68b7fc094cb2fa6890a464e4d836e8107f5e", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASatisfaction(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSASatisfaction", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Satisfaction", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-satisfaction", + "revision": "50fd9d5d09edd53af89af765636be5db6f983f0e", + }, + type="Classification", + category="p2p", + 
modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAFriendship(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAFriendship", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Friendship", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-friendship", + "revision": "9dae119101e9b4e9bb40d5b9d29ffd7a621f9942", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAFear(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAFear", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Fear", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-fear", + "revision": "3c22f7e6bf4e366c86d69293c9164bf9e9d80aac", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and 
verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAJealousy(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAJealousy", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Jealousy", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-jealousy", + "revision": "0d5104ecaa109d2448afe1f40dbf860f5e4458a8", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASurprise(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSASurprise", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Surprise", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-surprise", + "revision": "62dad66fc2837b0ac5e5175fe7c265d2d502a386", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSALove(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSALove", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Love", + reference="https://mcinext.com/", + 
dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-love", + "revision": "0e000b2f73e9bb74ec8fc6da10011c52725b8469", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASadness(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSASadness", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Sadness", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-sadness", + "revision": "e9c678325565a5e4dadc43fd6693a8ccff1dd6b2", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAHappiness(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAHappiness", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Happiness", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-happiness", + "revision": "e60893b7a8d01c9b8c12fadfe8f0a06e9d548a63", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + 
date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAToneChatbotClassification", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-tone-chatbot-classification", + "revision": "1f403cfadb85004fbf7e2480334fffc4c999b4ab", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAToneUserClassification", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Tone User", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/chatbot-conversational-sentiment-analysis-tone-user-classification", + "revision": "dd0f76661bef69819cc38c8a455b10af86ac3571", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and 
verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotSatisfactionLevelClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotSatisfactionLevelClassification", + description="Synthetic Persian Chatbot Satisfaction Level Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-satisfaction-level-classification", + "revision": "e72db473602d750f1bcdc9f0436e1e3b967e088f", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotRAGToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGToneChatbotClassification", + description="Synthetic Persian Chatbot RAG Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-tone-chatbot-classification", + "revision": "76f15a203fc13bd98a8f0fdddab1b68c28d7d674", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotRAGToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGToneUserClassification", + description="Synthetic Persian Chatbot RAG Tone User Classification", + 
reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-tone-user-classification", + "revision": "f1f6ad83bb135dc94fbf1ca05c3ba164f5619369", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotToneChatbotClassification", + description="Synthetic Persian Chatbot Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-tone-chatbot-classification", + "revision": "a5a739a036fa7bb8ae0be91bc081fdd260d4bdab", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotToneUserClassification", + description="Synthetic Persian Chatbot Tone User Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-tone-user-classification", + "revision": "780d629437f7be127c6b287a61776372f9f333b9", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + 
date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class PersianTextTone(AbsTaskClassification): + metadata = TaskMetadata( + name="PersianTextTone", + description="Persian Text Tone", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/persian-text-tone", + "revision": "7144f4c6bdd77911df0dfc5a8bd44dba17e27e3a", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SIDClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SIDClassification", + description="SID Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/sid-classification", + "revision": "29bed651bb980395f5aa473607154d93226945e1", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class DeepSentiPers(AbsTaskClassification): + metadata = TaskMetadata( + name="DeepSentiPers", + description="Persian Sentiment Analysis Dataset", + reference="https://github.com/JoyeBright/DeepSentiPers", + dataset={ + "path": "PartAI/DeepSentiPers", + "revision": "ee4f09f404051761cfe14d68127532c82de41cb3", + }, + 
type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("review", "text") + + +class PersianTextEmotion(AbsTaskClassification): + metadata = TaskMetadata( + name="PersianTextEmotion", + description="Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise.", + reference="https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion", + dataset={ + "path": "SeyedAli/Persian-Text-Emotion", + "revision": "518fcd2c8b89917c7696770672688217a2eabf88", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SentimentDKSF(AbsTaskClassification): + metadata = TaskMetadata( + name="SentimentDKSF", + description="The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis.", + reference="https://github.com/hezarai/hezar", + dataset={ + "path": "hezarai/sentiment-dksf", + "revision": "b4d5a8dd501db610b5ad89e9aa13f863b842b395", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + 
sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class NLPTwitterAnalysisClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="NLPTwitterAnalysisClassification", + description="Twitter Analysis Classification", + reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main", + dataset={ + "path": "hamedhf/nlp_twitter_analysis", + "revision": "4ceb1312583fd2c7c73ad2d550b726124dcd39a0", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Social"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("tweet", "text") + + +class DigikalamagClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DigikalamagClassification", + description="A total of 8,515 articles scraped from Digikala Online Magazine. 
This dataset includes seven different classes.", + reference="https://hooshvare.github.io/docs/datasets/tc", + dataset={ + "path": "PNLPhub/DigiMag", + "revision": "969b335c9f50eda5c384460be4eb2b55505c2c53", + "trust_remote_code": True, + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("content", "text") diff --git a/mteb/tasks/Classification/kor/KorFin.py b/mteb/tasks/Classification/kor/KorFin.py index a22b7d5cfe..1fdfb47694 100644 --- a/mteb/tasks/Classification/kor/KorFin.py +++ b/mteb/tasks/Classification/kor/KorFin.py @@ -25,7 +25,7 @@ class KorFin(AbsTaskClassification): "2022-01-01", "2022-12-31", ), # Assumed date based on the citations in the paper - domains=["News", "Written"], + domains=["News", "Written", "Financial"], task_subtypes=["Sentiment/Hate speech"], license="cc-by-sa-4.0", annotations_creators="expert-annotated", diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 014796a4cb..65d8b01246 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -18,6 +18,9 @@ from .eng.StackExchangeClusteringP2P import * from .eng.TwentyNewsgroupsClustering import * from .eng.WikiCitiesClustering import * +from .eng.WikipediaChemistrySpecialtiesClustering import * +from .eng.WikipediaChemistryTopicsClustering import * +from .fas.FaMTEBClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py index 
c74766061d..8b4beb0e26 100644 --- a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py +++ b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py @@ -21,13 +21,13 @@ class ArxivClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("1991-01-01", "2021-01-01"), # 1991-01-01 is the first arxiv paper + domains=["Academic", "Written"], + task_subtypes=[], + license="cc0-1.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{arxiv_org_submitters_2024, title={arXiv Dataset}, url={https://www.kaggle.com/dsv/7548853}, diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py index c9efbe954a..84c6602c63 100644 --- a/mteb/tasks/Clustering/eng/RedditClustering.py +++ b/mteb/tasks/Clustering/eng/RedditClustering.py @@ -85,14 +85,13 @@ class RedditClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", # derived from pushshift + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py index 1e8d51cdfa..fc74844a2e 100644 --- a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py @@ -29,14 +29,13 @@ class RedditClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], 
main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", # derived from pushshift + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py b/mteb/tasks/Clustering/eng/StackExchangeClustering.py index b123ab5bd1..c495b10de4 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClustering.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClustering.py @@ -87,14 +87,13 @@ class StackExchangeClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py index c411138e9f..a06eb82ae9 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py @@ -91,13 +91,13 @@ class StackExchangeClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", 
"Written"], + task_subtypes=["Thematic clustering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py new file mode 100644 index 0000000000..a4e4082a69 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaSpecialtiesInChemistryClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium5Clustering", + "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", + }, + type="Clustering", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py 
b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py new file mode 100644 index 0000000000..bfa5e1fcf3 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistryTopicsClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaChemistryTopicsClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy10Clustering", + "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", + }, + type="Clustering", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/fas/FaMTEBClustering.py b/mteb/tasks/Clustering/fas/FaMTEBClustering.py new file mode 100644 index 0000000000..da0b8b53f3 --- /dev/null +++ b/mteb/tasks/Clustering/fas/FaMTEBClustering.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import numpy as np +from datasets import Dataset, DatasetDict + +from mteb.abstasks.AbsTaskClusteringFast import ( + AbsTaskClusteringFast, + check_label_distribution, +) +from 
mteb.abstasks.TaskMetadata import TaskMetadata + + +class BeytooteClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="BeytooteClustering", + description="Beytoote Website Articles Clustering", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/beytoote-clustering", + "revision": "62ca5aecb9414214162569f2f1bfb07aa219a70e", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["News"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class DigikalamagClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="DigikalamagClustering", + description="A total of 8,515 articles scraped from Digikala Online Magazine. 
This dataset includes seven different classes.", + reference="https://hooshvare.github.io/docs/datasets/tc", + dataset={ + "path": "PNLPhub/DigiMag", + "revision": "969b335c9f50eda5c384460be4eb2b55505c2c53", + "trust_remote_code": True, + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"label": "labels", "content": "sentences"} + ) + + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class HamshahriClustring(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="HamshahriClustring", + description="These datasets have been extracted from the RSS feed of two Farsi news agency websites.", + reference="https://github.com/mallahyari/Farsi-datasets", + dataset={ + "path": "community-datasets/farsi_news", + "revision": "ca93dc707cea06cdb2e3ede3b547a1092053aca6", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["News"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.map( + lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"} + ) + self.dataset = self.dataset.map(lambda x: {"labels": x["tags"][0]}) + self.dataset = DatasetDict({"test": self.dataset["hamshahri"]}) + + ds = {} + for split in self.metadata.eval_splits: + labels = self.dataset[split]["labels"] + sentences = self.dataset[split]["sentences"] + + 
check_label_distribution(self.dataset[split]) + + # Remove sentences and labels with only 1 label example. + unique_labels, counts = np.unique(labels, return_counts=True) + solo_label_idx = np.where(counts == 1) + solo_labels = unique_labels[solo_label_idx] + is_solo = np.isin(labels, solo_labels) + split_ds = Dataset.from_dict({"labels": labels, "sentences": sentences}) + if is_solo.any(): + split_ds = split_ds.select(np.nonzero(is_solo == False)[0]) # noqa: E712 + ds[split] = split_ds + self.dataset = DatasetDict(ds) + + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class NLPTwitterAnalysisClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="NLPTwitterAnalysisClustering", + description="Clustering of tweets from twitter across 26 categories.", + reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main", + dataset={ + "path": "hamedhf/nlp_twitter_analysis", + "revision": "4ceb1312583fd2c7c73ad2d550b726124dcd39a0", + }, + type="Clustering", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Social"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("tweet", "sentences") + self.dataset = self.dataset.rename_column("label", "labels") + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class SIDClustring(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="SIDClustring", + description="Clustering of summariesfrom SIDClustring across categories.", + reference="https://www.sid.com/", + dataset={ + "path": "MCINext/sid-clustering", + "revision": 
"d361bb18535d592e845aeaaa8ac47a239aa2f87a", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) diff --git a/mteb/tasks/Clustering/fas/__init__.py b/mteb/tasks/Clustering/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py index 78291c0f37..ed0172ae79 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py @@ -34,6 +34,9 @@ class CIRRIT2IRetrieval(AbsTaskAny2AnyRetrieval): pages={2125--2134}, year={2021} }""", + prompt={ + "query": "Retrieve a day-to-day image that aligns with the modification instructions of the provided image." 
+ }, descriptive_stats={ "n_samples": {"test": 4170}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py index e1f8309066..ac7b310998 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py @@ -33,6 +33,7 @@ class EDIST2ITRetrieval(AbsTaskAny2AnyRetrieval): pages={4877--4894}, year={2023} }""", + prompt={"query": "Identify the news photo for the given caption."}, descriptive_stats={ "n_samples": {"test": 3241}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py index 4e24d13f7d..01f2e6a980 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py @@ -33,6 +33,9 @@ class EncyclopediaVQAIT2ITRetrieval(AbsTaskAny2AnyRetrieval): pages={3113--3124}, year={2023} }""", + prompt={ + "query": "Obtain illustrated documents that correspond to the inquiry alongside the provided image." + }, descriptive_stats={ "n_samples": {"test": 3743}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py index cb67b9ad0b..5ba43daf1d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py @@ -33,6 +33,9 @@ class Fashion200kI2TRetrieval(AbsTaskAny2AnyRetrieval): pages={1463--1471}, year={2017} }""", + prompt={ + "query": "Based on the following fashion description, retrieve the best matching image." 
+ }, descriptive_stats={ "n_samples": {"test": 4889}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py index 2648fb8ee1..1511de7aa4 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py @@ -34,6 +34,9 @@ class Fashion200kT2IRetrieval(AbsTaskAny2AnyRetrieval): pages={1463--1471}, year={2017} }""", + prompt={ + "query": "Based on the following fashion description, retrieve the best matching image." + }, descriptive_stats={ "n_samples": {"test": 1719}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py index d6099d1e0c..4e1209c23c 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py @@ -34,6 +34,9 @@ class FashionIQIT2IRetrieval(AbsTaskAny2AnyRetrieval): pages={11307--11317}, year={2021} }""", + prompt={ + "query": "Find a fashion image that aligns with the reference image and style note." 
+ }, descriptive_stats={ "n_samples": {"test": 6003}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py index 9818f4de54..43aeea20d4 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py @@ -35,6 +35,7 @@ class Flickr30kI2TRetrieval(AbsTaskAny2AnyRetrieval): pages={67-78}, url={https://api.semanticscholar.org/CorpusID:3104920} }""", + prompt={"query": "Find an image caption describing the following image."}, descriptive_stats={ "n_samples": {"test": 1000}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py index 6e889f9c54..cb87cfcf86 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py @@ -35,6 +35,7 @@ class Flickr30kT2IRetrieval(AbsTaskAny2AnyRetrieval): pages={67-78}, url={https://api.semanticscholar.org/CorpusID:3104920} }""", + prompt={"query": "Find an image that matches the given caption."}, descriptive_stats={ "n_samples": {"test": 5000}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py index 38caf36f1a..f695de1d19 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py @@ -34,6 +34,9 @@ class InfoSeekIT2ITRetrieval(AbsTaskAny2AnyRetrieval): pages={14948--14968}, year={2023} }""", + prompt={ + "query": "Find an image and subject description from Wikipedia that answers my question about this image." 
+ }, descriptive_stats={ "n_samples": {"test": 17593}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py index ac8861f0aa..e5cecd8591 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py @@ -34,6 +34,9 @@ class InfoSeekIT2TRetrieval(AbsTaskAny2AnyRetrieval): pages={14948--14968}, year={2023} }""", + prompt={ + "query": "Find a paragraph from Wikipedia that answers my question about this image." + }, descriptive_stats={ "n_samples": {"test": 11323}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py index e1c4d9ba2a..9a0ded2203 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py @@ -44,6 +44,9 @@ class LLaVAIT2TRetrieval(AbsTaskAny2AnyRetrieval): doi = "10.18653/v1/2024.acl-long.289", pages = "5294--5316", }""", + prompt={ + "query": "Provide a specific decription of the image along with the following question." + }, descriptive_stats={ "n_samples": {"test": 5120}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py index 2e84d22ee7..bc4ce63c72 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py @@ -35,6 +35,9 @@ class MSCOCOI2TRetrieval(AbsTaskAny2AnyRetrieval): year={2014}, organization={Springer} }""", + prompt={ + "query": "Find an image caption describing the following everyday image." 
+ }, descriptive_stats={ "n_samples": {"test": 5000}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py index 1ad8aa7a04..4885e236c2 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py @@ -35,6 +35,7 @@ class MSCOCOT2IRetrieval(AbsTaskAny2AnyRetrieval): year={2014}, organization={Springer} }""", + prompt={"query": "Identify the image showcasing the described everyday scene."}, descriptive_stats={ "n_samples": {"test": 24809}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py index aed1805aae..aa05ac6494 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py @@ -33,6 +33,9 @@ class NIGHTSI2IRetrieval(AbsTaskAny2AnyRetrieval): volume={36}, year={2024} }""", + prompt={ + "query": "Find a day-to-day image that looks similar to the provided image." + }, descriptive_stats={ "n_samples": {"test": 2120}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py index a072f896e2..65b1c3b202 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py @@ -33,6 +33,9 @@ class OKVQAIT2TRetrieval(AbsTaskAny2AnyRetrieval): pages={3195--3204}, year={2019} }""", + prompt={ + "query": "Retrieve documents that provide an answer to the question alongside the image." 
+ }, descriptive_stats={ "n_samples": {"test": 5046}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py index f355c43ecf..c6d1ef6baa 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py @@ -33,6 +33,9 @@ class OVENIT2ITRetrieval(AbsTaskAny2AnyRetrieval): pages={12065--12075}, year={2023} }""", + prompt={ + "query": "Retrieve a Wikipedia image-description pair that provides evidence for the question of this image." + }, descriptive_stats={ "n_samples": {"test": 14741}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py index 308283454c..94898f4819 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py @@ -33,6 +33,9 @@ class OVENIT2TRetrieval(AbsTaskAny2AnyRetrieval): pages={12065--12075}, year={2023} }""", + prompt={ + "query": "Retrieve a Wikipedia paragraph that provides an answer to the given query about the image." + }, descriptive_stats={ "n_samples": {"test": 50004}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py index 00577d4ce8..648d2d2e44 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py @@ -45,6 +45,9 @@ class ReMuQIT2TRetrieval(AbsTaskAny2AnyRetrieval): doi = "10.18653/v1/2023.acl-long.478", pages = "8573--8589", }""", + prompt={ + "query": "Retrieve a fact-based paragraph that provides an answer to the given query about the image." 
+ }, descriptive_stats={ "n_samples": {"test": 3609}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py index f85e1dadd7..44d0d36cb0 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py @@ -98,6 +98,7 @@ class VidoreArxivQARetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -151,6 +152,7 @@ class VidoreDocVQARetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -204,6 +206,7 @@ class VidoreInfoVQARetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -257,6 +260,7 @@ class VidoreTabfquadRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -310,6 +314,7 @@ class VidoreTatdqaRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -363,6 +368,7 @@ class VidoreShiftProjectRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's 
question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -416,6 +422,7 @@ class VidoreSyntheticDocQAAIRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -469,6 +476,7 @@ class VidoreSyntheticDocQAEnergyRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -522,6 +530,7 @@ class VidoreSyntheticDocQAGovernmentReportsRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { @@ -575,6 +584,7 @@ class VidoreSyntheticDocQAHealthcareIndustryRetrieval(AbsTaskAny2AnyRetrieval): journal={arXiv preprint arXiv:2407.01449}, year={2024} }""", + prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py index ecce9f1e9a..2f79bfe9eb 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py @@ -33,6 +33,7 @@ class VisualNewsI2TRetrieval(AbsTaskAny2AnyRetrieval): pages={6761--6771}, year={2021} }""", + prompt={"query": "Find a caption for the news in the given photo."}, descriptive_stats={ "n_samples": {"test": 20000}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py index 
c700a5ab3c..1c5fa7fdbe 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py @@ -33,6 +33,9 @@ class VisualNewsT2IRetrieval(AbsTaskAny2AnyRetrieval): pages={6761--6771}, year={2021} }""", + prompt={ + "query": "Identify the news-related image in line with the described event." + }, descriptive_stats={ "n_samples": {"test": 19995}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py index 583cae54dd..e3235c4912 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py @@ -33,6 +33,7 @@ class WebQAT2ITRetrieval(AbsTaskAny2AnyRetrieval): pages={16495--16504}, year={2022} }""", + prompt={"query": "Find a Wikipedia image that answers this question."}, descriptive_stats={ "n_samples": {"test": 2511}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py index eddc6e0fc4..4583e61221 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py @@ -33,6 +33,9 @@ class WebQAT2TRetrieval(AbsTaskAny2AnyRetrieval): pages={16495--16504}, year={2022} }""", + prompt={ + "query": "Retrieve passages from Wikipedia that provide answers to the following question." 
+ }, descriptive_stats={ "n_samples": {"test": 2455}, "avg_character_length": { diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index c2057a4952..6cd75ea144 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -4,15 +4,21 @@ from .ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * from .eng.LegalBenchPC import * +from .eng.PubChemAISentenceParaphrasePC import * +from .eng.PubChemSMILESPC import * +from .eng.PubChemSynonymPC import * +from .eng.PubChemWikiParagraphsPC import * from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * +from .fas.FaMTEBPairClassification import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * from .kor.KlueNLI import * from .multilingual.OpusparcusPC import * from .multilingual.PawsXPairClassification import * +from .multilingual.PubChemWikiPairClassification import * from .multilingual.RTE3 import * from .multilingual.XNLI import * from .multilingual.XStance import * diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py new file mode 100644 index 0000000000..f453ebee31 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemAISentenceParaphrasePC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemAISentenceParaphrasePC", + "revision":
"f33a205966ce032f957c3a22f4f9e378f89a2c56", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py new file mode 100644 index 0000000000..b3e297e043 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + +_DATASET_COLUMN_MAP = [ + { + "name":
"iso-desc", + "sent1": "description", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "iso-title", + "sent1": "title", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "canon-desc", + "sent1": "description", + "sent2": "canonical_smiles", + "labels": "labels", + }, + { + "name": "canon-title", + "sent1": "title", + "sent2": "canonical_smiles", + "labels": "labels", + }, +] + + +class PubChemSMILESPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSMILESPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemSMILESPairClassification", + "revision": "7ba40b69f5fe6ffe4cc189aac9e1710913c73c8a", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def load_data(self): + """Load dataset
from HuggingFace hub""" + if self.data_loaded: + return + + _hf_dataset = None + for dataset_col_map in _DATASET_COLUMN_MAP: + _dataset = datasets.load_dataset( + self.metadata.dataset["path"], + dataset_col_map["name"], + revision=self.metadata.dataset["revision"], + ) + + _dataset = _dataset.rename_columns( + { + dataset_col_map["sent1"]: "sentence1", + dataset_col_map["sent2"]: "sentence2", + dataset_col_map["labels"]: "labels", + } + ) + + if _hf_dataset is None: + _hf_dataset = _dataset + else: + _hf_dataset["test"] = datasets.concatenate_datasets( + [_hf_dataset["test"], _dataset["test"]] + ) + + self.dataset = _hf_dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=self.metadata.eval_splits, + label="labels", + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sentence1"], + "sentence2": hf_dataset["sentence2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py new file mode 100644 index 0000000000..6b6dfd81c8 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSynonymPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSynonymPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemSynonymPC", + "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b", + }, + type="PairClassification", + category="s2s", +
modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["title"], + "sentence2": hf_dataset["synonyms"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py new file mode 100644 index 0000000000..679580f28c --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemWikiParagraphsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemWikiParagraphsPC", +
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemWikiParagraphsPC", + "revision": "7fb14716e4106b72f51a16e682e5cd2d67e9bd70", + }, + type="PairClassification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py index b8bc686d87..9da7c1072e 100644 --- a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py +++
b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py @@ -21,12 +21,12 @@ class TwitterSemEval2015PC(AbsTaskPairClassification): eval_langs=["eng-Latn"], main_score="max_ap", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{xu-etal-2015-semeval, title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})", author = "Xu, Wei and diff --git a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py index 24839e5938..85432b1d97 100644 --- a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py +++ b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py @@ -21,12 +21,12 @@ class TwitterURLCorpusPC(AbsTaskPairClassification): eval_langs=["eng-Latn"], main_score="max_ap", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{lan-etal-2017-continuously, title = "A Continuously Growing Dataset of Sentential Paraphrases", author = "Lan, Wuwei and diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py new file mode 100644 index 0000000000..6deba76d8d --- /dev/null +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class 
CExaPPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="CExaPPC", + description="ExaPPC is a large paraphrase corpus consisting of monolingual sentence-level paraphrases using different sources.", + reference="https://github.com/exaco/exappc", + dataset={ + "path": "PNLPhub/C-ExaPPC", + "revision": "68a0ff474739367a36c8066ee04802a65aefc117", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Social", "Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "paraphrase" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sentence1"], + "sentence2": self.dataset[split]["sentence2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class SynPerChatbotRAGFAQPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGFAQPC", + description="Synthetic Persian Chatbot RAG FAQ Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-faq-pair-classification", + "revision": "2128d809e27ab8528906e2231f8e824516fb8e5a", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = 
[ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class FarsiParaphraseDetection(AbsTaskPairClassification): + metadata = TaskMetadata( + name="FarsiParaphraseDetection", + description="Farsi Paraphrase Detection", + reference="https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection", + dataset={ + "path": "alighasemi/farsi_paraphrase_detection", + "revision": "c8129741af418d9ae43cfc1fc4f285704e26035f", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sentence1"], + "sentence2": self.dataset[split]["sentence2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class SynPerTextKeywordsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerTextKeywordsPC", + description="Synthetic Persian Text Keywords Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-text-keyword-pair-classification", + "revision": "ea9a840cb163b415cc70b2f7adf2554feae159dc", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def 
dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class SynPerQAPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerQAPC", + description="Synthetic Persian QA Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-qa-pair-classification", + "revision": "d1b62ef31bebbb48ae01867993a1e583c2ce7d93", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class ParsinluEntail(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluEntail", + description="A Persian textual entailment task (deciding sent1 entails sent2). 
The questions are partially translated from the SNLI dataset and partially generated by expert annotators.", + reference="https://github.com/persiannlp/parsinlu", + dataset={ + "path": "persiannlp/parsinlu_entailment", + "revision": "c49b2d8fa0d6476520695c52207690b7ec854043", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.filter(lambda x: x["label"] != "n") + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "e" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"], + "sentence2": self.dataset[split]["sent2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class ParsinluQueryParaphPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluQueryParaphPC", + description="A Persian query paraphrasng task (deciding whether two questions are paraphrases of each other). 
The questions are partially generated from Google auto-complete, and partially translated from the Quora paraphrasing dataset.", + reference="https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing", + dataset={ + "path": "persiannlp/parsinlu_query_paraphrasing", + "revision": "ec675bb3ac50c1a52317c101fe1d724b4601f47a", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "1" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["q1"], + "sentence2": self.dataset[split]["q2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py new file mode 100644 index 0000000000..59a0605a82 --- /dev/null +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGES = { + "de": ["deu-Latn", "eng-Latn"], + "nl": ["nld-Latn", "eng-Latn"], + "zh": ["zho-Hans", "eng-Latn"], + "fr": ["fra-Latn", "eng-Latn"], + "es": ["spa-Latn", "eng-Latn"], + "pt": ["por-Latn", "eng-Latn"], + "ms": ["msa-Latn", "eng-Latn"], + "ko": ["kor-Hang", "eng-Latn"], + "tr": ["tur-Latn", "eng-Latn"], + "hi": 
["hin-Deva", "eng-Latn"], + "cs": ["ces-Latn", "eng-Latn"], + "ja": ["jpn-Jpan", "eng-Latn"], +} + + +class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask): + metadata = TaskMetadata( + name="PubChemWikiPairClassification", + dataset={ + "path": "BASF-AI/PubChemWikiMultilingualPC", + "revision": "3412b208896a37e4ebb5ff7b96f6cc313ee9d2e3", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + category="s2s", + modalities=["text"], + type="PairClassification", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self) -> None: + _dataset = {} + for lang in self.hf_subsets: + _dataset[lang] = {} + hf_dataset = self.dataset[lang][self.metadata.eval_splits[0]] + _dataset[lang]["test"] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] +
self.dataset = _dataset diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index 90fe689cdd..b9dfde0055 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -21,12 +21,12 @@ class AskUbuntuDupQuestions(AbsTaskReranking): eval_langs=["eng-Latn"], main_score="map", date=None, - domains=None, + domains=["Programming", "Web"], task_subtypes=None, license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", prompt="Retrieve duplicate questions from AskUbuntu forum", bibtex_citation="""@article{wang-2021-TSDAE, title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning", diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py index 9e47461620..897f9d7bc9 100644 --- a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py +++ b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py @@ -20,13 +20,13 @@ class StackOverflowDupQuestions(AbsTaskReranking): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="map", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2014-01-21", "2018-01-01"), + domains=["Written", "Blog", "Programming"], + task_subtypes=["Question answering"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", prompt="Retrieve duplicate questions from StackOverflow forum", bibtex_citation="""@article{Liu2018LinkSOAD, title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums}, diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index d83df7ec5e..06414da081 100644 --- 
a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -5,6 +5,7 @@ from .code.CodeEditSearchRetrieval import * from .code.CodeFeedbackMTRetrieval import * from .code.CodeFeedbackSTRetrieval import * +from .code.CodeRAG import * from .code.CodeSearchNetCCRetrieval import * from .code.CodeSearchNetRetrieval import * from .code.CodeTransOceanContestRetrieval import * @@ -29,6 +30,8 @@ from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * from .eng.BrightRetrieval import * +from .eng.ChemHotpotQARetrieval import * +from .eng.ChemNQRetrieval import * from .eng.ClimateFEVERRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * @@ -101,6 +104,8 @@ from .eng.TRECCOVIDRetrieval import * from .eng.WinoGrandeRetrieval import * from .est.estqa import * +from .fas.BEIRFa import * +from .fas.FaMTEBRetrieval import * from .fra.AlloprofRetrieval import * from .fra.BSARDRetrieval import * from .fra.FQuADRetrieval import * diff --git a/mteb/tasks/Retrieval/code/CodeRAG.py b/mteb/tasks/Retrieval/code/CodeRAG.py new file mode 100644 index 0000000000..3724f44eca --- /dev/null +++ b/mteb/tasks/Retrieval/code/CodeRAG.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +def split_by_first_newline(s): + # Split at the first newline; the tail is "" when s contains no newline + head, _, tail = s.partition("\n") + # Always return a (head, tail) tuple so callers see a single return type + return head, tail + + +common_args = { + "reference": "https://arxiv.org/pdf/2406.14497", + "type": "Reranking", + "category": "s2s", + "modalities": ["text"], + "eval_splits": ["train"], + "eval_langs": ["python-Code"], + "main_score": "ndcg_at_10", + "date": ("2024-06-02", "2024-06-02"), # best guess + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "cc-by-sa-4.0", +
class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
    """Retrieval task built from the CodeRAG-Bench programming-solutions set.

    Every dataset row is split at its first newline: the part before it is
    used as the query and the remainder as that query's single relevant
    document.
    """

    metadata = TaskMetadata(
        name="CodeRAGProgrammingSolutions",
        description="Evaluation of programming solution retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant programming solutions given code-related queries.",
        dataset={
            "path": "code-rag-bench/programming-solutions",
            "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
        },
        **common_args,  # type: ignore
    )

    def load_data(self, **kwargs):
        """Fetch the dataset from the HuggingFace hub and transform it once.

        Calls made after ``self.data_loaded`` has been set do nothing.
        """
        if not self.data_loaded:
            self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self) -> None:
        """Build the split-keyed retrieval attributes from the raw rows.

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # title and text
        self.queries = Dict[split, Dict[query_id, str]]
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, score]]]
        """
        eval_split = self.metadata.eval_splits[0]

        # The local dicts are the very objects stored on the instance, so
        # filling them below fills the task attributes.
        corpus, qrels, queries = {}, {}, {}
        self.corpus = {eval_split: corpus}
        self.relevant_docs = {eval_split: qrels}
        self.queries = {eval_split: queries}

        rows: datasets.Dataset = self.dataset[eval_split]  # type: ignore
        rows = rows.shuffle(seed=42)

        for raw_text, row_meta in zip(rows["text"], rows["meta"]):
            # In code-rag-bench the text is the query, a newline, then the code.
            question, solution = split_by_first_newline(raw_text)

            # The dataset's own task id is used as the query id, unchanged.
            task_id = row_meta["task_id"]
            solution_id = f"doc_{task_id}"

            queries[task_id] = question
            corpus[solution_id] = {"title": "", "text": solution}
            # Exactly one relevant document per query.
            qrels[task_id] = {solution_id: 1}
class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
    """Retrieval task built from the CodeRAG-Bench library-documentation set.

    Each ``doc_content`` entry is split at its first newline into a query (the
    first line) and a document (the rest). Entries with nothing after the
    first line are left out.
    """

    metadata = TaskMetadata(
        name="CodeRAGLibraryDocumentationSolutions",
        description="Evaluation of code library documentation retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant Python library documentation sections given code-related queries.",
        dataset={
            "path": "code-rag-bench/library-documentation",
            "revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
        },
        **common_args,  # type: ignore
    )

    def load_data(self, **kwargs):
        """Fetch the dataset from the HuggingFace hub and transform it once.

        Calls made after ``self.data_loaded`` has been set do nothing.
        """
        if not self.data_loaded:
            self.dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self) -> None:
        """Build the split-keyed retrieval attributes from the raw rows.

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # title and text
        self.queries = Dict[split, Dict[query_id, str]]
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, score]]]
        """
        eval_split = self.metadata.eval_splits[0]

        # The local dicts are the very objects stored on the instance, so
        # filling them below fills the task attributes.
        corpus, qrels, queries = {}, {}, {}
        self.corpus = {eval_split: corpus}
        self.relevant_docs = {eval_split: qrels}
        self.queries = {eval_split: queries}

        rows: datasets.Dataset = self.dataset[eval_split]  # type: ignore
        rows = rows.shuffle(seed=42)

        # Text format: "document title \n document content".
        split_docs = (split_by_first_newline(raw) for raw in rows["doc_content"])
        # Some library documents have no content after the title and therefore
        # no query-doc pair; they are dropped before numbering so the ids of
        # the kept entries stay consecutive.
        paired = ((heading, content) for heading, content in split_docs if content)

        for position, (heading, content) in enumerate(paired):
            query_key = str(position)
            doc_key = f"doc_{position}"
            queries[query_key] = heading
            corpus[doc_key] = {"title": "", "text": content}
            # Exactly one relevant document per query.
            qrels[query_key] = {doc_key: 1}
metadata = TaskMetadata( + name="CodeRAGStackoverflowPosts", + description="Evaluation of StackOverflow post retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant StackOverflow posts given code-related queries.", + dataset={ + "path": "code-rag-bench/stackoverflow-posts", + "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """And transform to a retrieval datset, which have the following attributes + + self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text + self.queries = Dict[query_id, str] #id => query + self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["text"] + id = 0 + for text in texts: + # in code-rag-bench, + # text = query + "\n" + doc + query, doc = split_by_first_newline(text) + + query_id = str(id) + doc_id = f"doc_{id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + self.relevant_docs[split][query_id] = { + doc_id: 1 + } # only one correct matches + id += 1 diff --git a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py index b95c61af47..156395a077 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackAndroidRetrieval(AbsTaskRetrieval): 
eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Programming", "Web", "Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py index d9f1c1f344..af47eda5c4 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackEnglishRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py index 8c89299957..b51a3e64b5 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackGamingRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Web", "Written"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py index 8ed296b003..da38284f2d 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackGisRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py index 0d1804e5e7..b29d166129 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackMathematicaRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py index 77402252f9..3dd0fdc4a5 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackPhysicsRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py index 1fa63dd20a..f84b1b17e4 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py @@ -23,7 +23,7 @@ class CQADupstackProgrammersRetrieval(AbsTaskRetrieval): date=None, domains=["Programming", "Written", "Non-fiction"], task_subtypes=[], - license="cc-by-sa-4.0", + license="apache-2.0", annotations_creators="derived", dialect=[], sample_creation="found", diff --git a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py index 8b2ee5950a..1fd18f8d84 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackStatsRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py index 2e87f49710..c4447442be 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackTexRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py index f86d886519..57c9964b15 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackUnixRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py index eedacec19a..2e9bd63e08 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackWebmastersRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web"], + task_subtypes=["Question answering"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py index e70255c371..3b11866f82 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackWordpressRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
class ChemHotpotQARetrieval(AbsTaskRetrieval):
    """ChemTEB retrieval task backed by ``BASF-AI/ChemHotpotQARetrieval``.

    English (``eng-Latn``) sentence-to-paragraph retrieval, evaluated on the
    ``train``, ``dev`` and ``test`` splits with ``ndcg_at_10`` as main score.
    """

    metadata = TaskMetadata(
        name="ChemHotpotQARetrieval",
        dataset={
            "path": "BASF-AI/ChemHotpotQARetrieval",
            "revision": "1840e8a5ac6ec752bbdd97d543ead0189bc7c25b",
        },
        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
        reference="https://arxiv.org/abs/2412.00532",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_splits=["train", "dev", "test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2024-06-01", "2024-11-30"),
        domains=["Chemistry"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        # The BibTeX "\&" is written as "\\&": a lone backslash before "&" is an
        # invalid escape sequence in a non-raw string literal. The resulting
        # text is the same.
        bibtex_citation="""
    @article{kasmaee2024chemteb,
    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain},
    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
    journal={arXiv preprint arXiv:2412.00532},
    year={2024}
    }
    @inproceedings{yang-etal-2018-hotpotqa,
        title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering",
        author = "Yang, Zhilin  and
          Qi, Peng  and
          Zhang, Saizheng  and
          Bengio, Yoshua  and
          Cohen, William  and
          Salakhutdinov, Ruslan  and
          Manning, Christopher D.",
        editor = "Riloff, Ellen  and
          Chiang, David  and
          Hockenmaier, Julia  and
          Tsujii, Jun{'}ichi",
        booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
        month = oct # "-" # nov,
        year = "2018",
        address = "Brussels, Belgium",
        publisher = "Association for Computational Linguistics",
        url = "https://aclanthology.org/D18-1259",
        doi = "10.18653/v1/D18-1259",
        pages = "2369--2380",
        abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.",
    }
""",
    )
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le + and Slav Petrov}, + year = {2019}, + journal = {Transactions of the Association of Computational Linguistics}} + """, + ) diff --git a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py index d60b7a3817..b87e5223e0 100644 --- a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py @@ -21,12 +21,12 @@ class ClimateFEVER(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{diggelmann2021climatefever, title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, @@ -57,12 +57,12 @@ class ClimateFEVERHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{diggelmann2021climatefever, title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, diff --git a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py index 776fd2fbe6..fff60a54d2 100644 --- 
a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py @@ -27,12 +27,12 @@ class FEVER(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{thorne-etal-2018-fever, title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", author = "Thorne, James and diff --git a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py index 1489cd168c..7a99d48a95 100644 --- a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py +++ b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py @@ -23,12 +23,12 @@ class FiQA2018(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{ thakur2021beir, title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, diff --git a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py index 5ada0cf887..6ebb5d7277 100644 --- a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py @@ -23,12 +23,23 @@ class MSMARCO(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + 
domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and @@ -73,12 +84,23 @@ class MSMARCOHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and diff --git a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py index d3b10738cf..7487abb887 100644 --- a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py @@ -21,12 +21,23 @@ class MSMARCOv2(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and diff --git a/mteb/tasks/Retrieval/eng/NQRetrieval.py 
b/mteb/tasks/Retrieval/eng/NQRetrieval.py index 661bf3e0e2..e81018dbc4 100644 --- a/mteb/tasks/Retrieval/eng/NQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NQRetrieval.py @@ -21,12 +21,12 @@ class NQ(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Encyclopaedic"], + task_subtypes=["Question answering"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee @@ -67,4 +67,7 @@ class NQHardNegatives(AbsTaskRetrieval): and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational Linguistics}}""", + prompt={ + "query": "Given a question, retrieve Wikipedia passages that answer the question" + }, ) diff --git a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py index 73660fb573..52e6cca4b1 100644 --- a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py +++ b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py @@ -26,12 +26,12 @@ class QuoraRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Blog"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", 
class ArguAnaFa(AbsTaskRetrieval):
    """Persian (``fas-Arab``) ArguAna retrieval task backed by ``MCINext/arguana-fa``.

    Evaluated on the ``test`` split with ``ndcg_at_10`` as main score.
    """

    # NOTE(review): presumably tells the evaluator to disregard retrieved
    # documents whose id equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and source
        name="ArguAna-Fa",
        dataset={
            "path": "MCINext/arguana-fa",
            "revision": "fa97814be356fe4d18caadb457b4469bd34019ca",
        },
        reference="https://huggingface.co/datasets/MCINext/arguana-fa",
        description="ArguAna-Fa",
        # Task shape and evaluation
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["fas-Arab"],
        eval_splits=["test"],
        main_score="ndcg_at_10",
        # Provenance
        date=("2024-09-01", "2024-12-31"),
        domains=["Blog"],
        task_subtypes=["Article retrieval"],
        license="not specified",
        annotations_creators="derived",
        sample_creation="found",
        dialect=[],
        # No citation is recorded for this dataset.
        bibtex_citation=""" """,
    )
class CQADupstackAndroidRetrievalFa(AbsTaskRetrieval):
    """Persian (``fas-Arab``) CQADupstack-Android retrieval task.

    Backed by ``MCINext/cqadupstack-android-fa`` and evaluated on the ``test``
    split with ``ndcg_at_10`` as main score.
    """

    metadata = TaskMetadata(
        # Identity and source
        name="CQADupstackAndroidRetrieval-Fa",
        dataset={
            "path": "MCINext/cqadupstack-android-fa",
            "revision": "bcdaf4e30477eea9b9b17ecbb153ca403e5c3ebd",
        },
        reference="https://huggingface.co/datasets/MCINext/cqadupstack-android-fa",
        description="CQADupstackAndroidRetrieval-Fa",
        # Task shape and evaluation
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["fas-Arab"],
        eval_splits=["test"],
        main_score="ndcg_at_10",
        # Provenance
        date=("2024-09-01", "2024-12-31"),
        domains=["Web"],
        task_subtypes=["Question answering"],
        license="not specified",
        annotations_creators="derived",
        sample_creation="found",
        dialect=[],
        # No citation is recorded for this dataset.
        bibtex_citation=""" """,
    )
name="CQADupstackGamingRetrieval-Fa", + description="CQADupstackGamingRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa", + dataset={ + "path": "MCINext/cqadupstack-gaming-fa", + "revision": "e9c7ad03f29a55ab14eae730146961b8cdc14227", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackGisRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGisRetrieval-Fa", + description="CQADupstackGisRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa", + dataset={ + "path": "MCINext/cqadupstack-gis-fa", + "revision": "e907c4144dc27bc8a035d78d69e15f39c56623a9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackMathematicaRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematicaRetrieval-Fa", + description="CQADupstackMathematicaRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa", + dataset={ + "path": "MCINext/cqadupstack-mathematica-fa", + "revision": "b92e24fab42ab599535a19ee744de5485ec92f64", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + 
license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackPhysicsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysicsRetrieval-Fa", + description="CQADupstackPhysicsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa", + dataset={ + "path": "MCINext/cqadupstack-physics-fa", + "revision": "816ad7473d6813f77a0ca5e72b1ff7a52752d370", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackProgrammersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackProgrammersRetrieval-Fa", + description="CQADupstackProgrammersRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa", + dataset={ + "path": "MCINext/cqadupstack-programmers-fa", + "revision": "be6460df57ab7c1b2c9fe295a31660dbd077ecf0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackStatsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStatsRetrieval-Fa", + description="CQADupstackStatsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa", + dataset={ + "path": "MCINext/cqadupstack-stats-fa", + "revision": "c6e2c8b6153958118ec04352dd82a30ea2e2cad5", + }, + 
type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackTexRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTexRetrieval-Fa", + description="CQADupstackTexRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa", + dataset={ + "path": "MCINext/cqadupstack-tex-fa", + "revision": "860d152c86fda27229270b6bf4e832ff374ac01b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackUnixRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnixRetrieval-Fa", + description="CQADupstackUnixRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa", + dataset={ + "path": "MCINext/cqadupstack-unix-fa", + "revision": "c2a326387954aad66ff00d324a9278067b1e3bb6", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWebmastersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmastersRetrieval-Fa", + description="CQADupstackWebmastersRetrieval-Fa", + 
reference="https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa", + dataset={ + "path": "MCINext/cqadupstack-webmasters-fa", + "revision": "506f29f8ce59648efe99afee736b0b158eced516", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWordpressRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpressRetrieval-Fa", + description="CQADupstackWordpressRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa", + dataset={ + "path": "MCINext/cqadupstack-wordpress-fa", + "revision": "7f755e88647b4023df52da04d4e3d31a7de5fcb0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class DBPediaFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-Fa", + description="DBPedia-Fa", + reference="https://huggingface.co/datasets/MCINext/dbpedia-fa", + dataset={ + "path": "MCINext/dbpedia-fa", + "revision": "13529e6e301e9d72f86def882cfbca04791d83f9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class 
FiQA2018Fa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="FiQA2018-Fa", + description="FiQA2018-Fa", + reference="https://huggingface.co/datasets/MCINext/fiqa-fa", + dataset={ + "path": "MCINext/fiqa-fa", + "revision": "e683ce7ecd0b47edc3e29fda7cfd75335be4a997", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class HotpotQAFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HotpotQA-Fa", + description="HotpotQA-Fa", + reference="https://huggingface.co/datasets/MCINext/hotpotqa-fa", + dataset={ + "path": "MCINext/hotpotqa-fa", + "revision": "1cafde1306aa56b5dfdce0b14633ae9ee1a63ddb", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class MSMARCOFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="MSMARCO-Fa", + description="MSMARCO-Fa", + reference="https://huggingface.co/datasets/MCINext/msmarco-fa", + dataset={ + "path": "MCINext/msmarco-fa", + "revision": "88f90b0b04f91778ba5341095b0a9f1d7799ce10", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["dev"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + 
bibtex_citation=""" """, + ) + + +class NFCorpusFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-Fa", + description="NFCorpus-Fa", + reference="https://huggingface.co/datasets/MCINext/nfcorpus-fa", + dataset={ + "path": "MCINext/nfcorpus-fa", + "revision": "70aa71825a791e87210c0355a01f538aa611feae", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class NQFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NQ-Fa", + description="NQ-Fa", + reference="https://huggingface.co/datasets/MCINext/nq-fa", + dataset={ + "path": "MCINext/nq-fa", + "revision": "d4ea898b644c8d5f608b60947cb637bebbf1ac66", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class QuoraRetrievalFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="QuoraRetrieval-Fa", + description="QuoraRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/quora-fa", + dataset={ + "path": "MCINext/quora-fa", + "revision": "1a43f4f5dcd71e6b14b275ae82c3237cdd4fd5fd", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + 
bibtex_citation=""" """, + ) + + +class SCIDOCSFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-Fa", + description="SCIDOCS-Fa", + reference="https://huggingface.co/datasets/MCINext/scidocs-fa", + dataset={ + "path": "MCINext/scidocs-fa", + "revision": "6611ebf4b4c1aaf8b021e4728440db2188291b8b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class SciFactFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-Fa", + description="SciFact-Fa", + reference="https://huggingface.co/datasets/MCINext/scifact-fa", + dataset={ + "path": "MCINext/scifact-fa", + "revision": "7723397096199c4d6f367b445fccaf282c518abe", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class TRECCOVIDFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-Fa", + description="TRECCOVID-Fa", + reference="https://huggingface.co/datasets/MCINext/trec-covid-fa", + dataset={ + "path": "MCINext/trec-covid-fa", + "revision": "98e6c2d33dfa166ee326e8b1bc7ea82c7e6898dd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, 
+ ) + + +class Touche2020Fa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020-Fa", + description="Touche2020-Fa", + reference="https://huggingface.co/datasets/MCINext/touche2020-fa", + dataset={ + "path": "MCINext/touche2020-fa", + "revision": "0f464636f91641cc6ef6f6f8f249c73f4a609982", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py new file mode 100644 index 0000000000..875f7ea7db --- /dev/null +++ b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class SynPerQARetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerQARetrieval", + description="Synthetic Persian QA Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/settings", + dataset={ + "path": "MCINext/synthetic-persian-qa-retrieval", + "revision": "e85114f13f42dc1edc456d58931cc38d44d697cf", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotTopicsRetrieval", + 
description="Synthetic Persian Chatbot Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-topics-retrieval", + "revision": "086995ca4cea33f37a407c2fa5282f74913740ee", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGTopicsRetrieval", + description="Synthetic Persian Chatbot RAG Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + "revision": "da8f36a723da155738f5e3d8d84d543589bd5083", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGFAQRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGFAQRetrieval", + description="Synthetic Persian Chatbot RAG FAQ Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + "revision": "9d32af6540970e2845028cbfffe6b0d0e8f52428", + }, + type="Retrieval", + 
category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class PersianWebDocumentRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="PersianWebDocumentRetrieval", + description="Persian dataset designed specifically for the task of text information retrieval through the web.", + reference="https://ieeexplore.ieee.org/document/10553090", + dataset={ + "path": "MCINext/persian-web-document-retrieval", + "revision": "b3dc818368a867b30ccb55a42ff287d253512c36", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + ) diff --git a/mteb/tasks/Retrieval/fas/__init__.py b/mteb/tasks/Retrieval/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py index 4a24e04e9c..6eec67aad2 100644 --- a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py +++ b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py @@ -22,7 +22,7 @@ class AutoRAGRetrieval(AbsTaskRetrieval): eval_langs=["kor-Hang"], main_score="ndcg_at_10", date=("2024-08-03", "2024-08-03"), - domains=["Government", "Medical", "Legal", "Social"], + domains=["Government", "Medical", "Legal", "Social", "Financial"], task_subtypes=["Article retrieval"], license="mit", annotations_creators="human-annotated", diff --git a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py 
b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py index 342f727144..ada5c4ca8e 100644 --- a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py @@ -24,11 +24,11 @@ class ArguAnaPL(AbsTaskRetrieval): eval_langs=["pol-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Medical", "Written"], task_subtypes=None, - license=None, + license="cc-by-sa-4.0", annotations_creators=None, - dialect=None, + dialect=[], sample_creation=None, bibtex_citation="""@misc{wojtasik2024beirpl, title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, diff --git a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py index 0a125f5e4f..b54f4ae4ed 100644 --- a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py @@ -24,12 +24,12 @@ class FiQAPLRetrieval(AbsTaskRetrieval): eval_langs=["pol-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{ thakur2021beir, title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index b61b79b293..471789f1c9 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -10,6 +10,7 @@ from .eng.STS16STS import * from .eng.STSBenchmarkSTS import * from .fao.FaroeseSTS import * +from .fas.FaMTEBSTS import * from .fin.FinParaSTS import * from .fra.SickFrSTS import * from .jpn.JSICK import * diff --git a/mteb/tasks/STS/eng/BiossesSTS.py b/mteb/tasks/STS/eng/BiossesSTS.py index ce54e37789..1fc1d5a1d0 100644 --- a/mteb/tasks/STS/eng/BiossesSTS.py 
+++ b/mteb/tasks/STS/eng/BiossesSTS.py @@ -21,12 +21,12 @@ class BiossesSTS(AbsTaskSTS): eval_langs=["eng-Latn"], main_score="cosine_spearman", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Medical"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{10.1093/bioinformatics/btx238, author = {Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan}, title = "{BIOSSES: a semantic sentence similarity estimation system for the biomedical domain}", diff --git a/mteb/tasks/STS/eng/STSBenchmarkSTS.py b/mteb/tasks/STS/eng/STSBenchmarkSTS.py index 099fba6773..e600711d34 100644 --- a/mteb/tasks/STS/eng/STSBenchmarkSTS.py +++ b/mteb/tasks/STS/eng/STSBenchmarkSTS.py @@ -21,12 +21,12 @@ class STSBenchmarkSTS(AbsTaskSTS): eval_langs=["eng-Latn"], main_score="cosine_spearman", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Blog", "News", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="machine-translated and verified", bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, title = {Machine translated multilingual STS benchmark dataset.}, author={Philip May}, diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py new file mode 100644 index 0000000000..2ce9522cd4 --- /dev/null +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskSTS import AbsTaskSTS + + +class Farsick(AbsTaskSTS): + metadata = TaskMetadata( + name="Farsick", + description="A Persian Semantic Textual Similarity And Natural Language Inference Dataset", + 
reference="https://github.com/ZahraGhasemi-AI/FarSick", + dataset={ + "path": "MCINext/farsick-sts", + "revision": "f8b8d630f631c6c16b7bc3cb924bdf62a51bed06", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class SynPerSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="SynPerSTS", + description="Synthetic Persian Semantic Textual Similarity Dataset", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-sts", + "revision": "914047db08928b5326d8b106583dc563b73d1ecf", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class Query2Query(AbsTaskSTS): + metadata = TaskMetadata( + name="Query2Query", + description="Query to Query Datasets.", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/query-to-query-sts", + "revision": "52602079f9032fcf181775a310d79d2f197534e4", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + 
main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 2 + return metadata_dict diff --git a/mteb/tasks/STS/fas/__init__.py b/mteb/tasks/STS/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/SummaryRetrieval/__init__.py b/mteb/tasks/SummaryRetrieval/__init__.py new file mode 100644 index 0000000000..d000983be9 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from .fas.FaMTEBSummaryRetrieval import * diff --git a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py new file mode 100644 index 0000000000..f0797068c3 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SAMSumFa(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SAMSumFa", + description="Translated Version of SAMSum Dataset for summary retrieval.", + reference="https://huggingface.co/datasets/MCINext/samsum-fa", + dataset={ + "path": "MCINext/samsum-fa", + "revision": "fd981d78a0ab82c20d2e693a8b3929c5d71b0743", + }, + type="BitextMining", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="machine-translated", + bibtex_citation="", + ) + + def 
dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) + + +class SynPerChatbotSumSRetrieval(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SynPerChatbotSumSRetrieval", + description="Synthetic Persian Chatbot Summary Dataset for summary retrieval.", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-summary-retrieval", + "revision": "9002f5e9de4ef61f1f5c34831d2a5ed855bac0ae", + }, + type="BitextMining", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) + + +class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SynPerChatbotRAGSumSRetrieval", + description="Synthetic Persian Chatbot RAG Summary Dataset for summary retrieval.", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-summary-retrieval", + "revision": "f77746f286bbf2177ee7b5a803da8be440d5d4c1", + }, + type="BitextMining", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": 
"sentence1", "summary": "sentence2"} + ) diff --git a/mteb/tasks/SummaryRetrieval/fas/__init__.py b/mteb/tasks/SummaryRetrieval/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 8d49517136..e00f091174 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .aggregated_tasks import * from .BitextMining import * from .Classification import * from .Clustering import * @@ -19,3 +20,4 @@ from .SpeedTask import * from .STS import * from .Summarization import * +from .SummaryRetrieval import * diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py new file mode 100644 index 0000000000..917a667eb3 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.Retrieval import ( + CQADupstackAndroidRetrieval, + CQADupstackEnglishRetrieval, + CQADupstackGamingRetrieval, + CQADupstackGisRetrieval, + CQADupstackMathematicaRetrieval, + CQADupstackPhysicsRetrieval, + CQADupstackProgrammersRetrieval, + CQADupstackStatsRetrieval, + CQADupstackTexRetrieval, + CQADupstackUnixRetrieval, + CQADupstackWebmastersRetrieval, + CQADupstackWordpressRetrieval, +) + +task_list_cqa: list[AbsTask] = [ + CQADupstackAndroidRetrieval(), + CQADupstackEnglishRetrieval(), + CQADupstackGamingRetrieval(), + CQADupstackGisRetrieval(), + CQADupstackMathematicaRetrieval(), + CQADupstackPhysicsRetrieval(), + CQADupstackProgrammersRetrieval(), + CQADupstackStatsRetrieval(), + CQADupstackTexRetrieval(), + CQADupstackUnixRetrieval(), + CQADupstackWebmastersRetrieval(), + CQADupstackWordpressRetrieval(), +] + + +class CQADupstackRetrieval(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + 
name="CQADupstackRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + tasks=task_list_cqa, + main_score="ndcg_at_10", + type="Retrieval", # since everything is retrieval - otherwise it would be "Aggregated" + eval_splits=["test"], + bibtex_citation="""@inproceedings{hoogeveen2015, +author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, +title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, +booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, +series = {ADCS '15}, +year = {2015}, +isbn = {978-1-4503-4040-3}, +location = {Parramatta, NSW, Australia}, +pages = {3:1--3:8}, +articleno = {3}, +numpages = {8}, +url = {http://doi.acm.org/10.1145/2838931.2838934}, +doi = {10.1145/2838931.2838934}, +acmid = {2838934}, +publisher = {ACM}, +address = {New York, NY, USA}, +}""", + ) diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py new file mode 100644 index 0000000000..6a60f4b000 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.Retrieval import ( + CQADupstackAndroidRetrievalFa, + CQADupstackEnglishRetrievalFa, + CQADupstackGamingRetrievalFa, + CQADupstackGisRetrievalFa, + CQADupstackMathematicaRetrievalFa, + CQADupstackPhysicsRetrievalFa, + CQADupstackProgrammersRetrievalFa, + CQADupstackStatsRetrievalFa, + CQADupstackTexRetrievalFa, + CQADupstackUnixRetrievalFa, + CQADupstackWebmastersRetrievalFa, + CQADupstackWordpressRetrievalFa, +) + +task_list_cqa: list[AbsTask] = [ + CQADupstackAndroidRetrievalFa(), + CQADupstackEnglishRetrievalFa(), + CQADupstackGamingRetrievalFa(), + 
CQADupstackGisRetrievalFa(), + CQADupstackMathematicaRetrievalFa(), + CQADupstackPhysicsRetrievalFa(), + CQADupstackProgrammersRetrievalFa(), + CQADupstackStatsRetrievalFa(), + CQADupstackTexRetrievalFa(), + CQADupstackUnixRetrievalFa(), + CQADupstackWebmastersRetrievalFa(), + CQADupstackWordpressRetrievalFa(), +] + + +class CQADupstackRetrievalFa(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="CQADupstackRetrieval-Fa", + description="CQADupstackRetrieval-Fa", + reference="", + tasks=task_list_cqa, + main_score="ndcg_at_10", + type="Retrieval", # since everything is retrieval - otherwise it would be "Aggregated" + eval_splits=["test"], + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py new file mode 100644 index 0000000000..46c6ed9600 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.Classification import ( + SynPerChatbotConvSAAnger, + SynPerChatbotConvSAFear, + SynPerChatbotConvSAFriendship, + SynPerChatbotConvSAHappiness, + SynPerChatbotConvSAJealousy, + SynPerChatbotConvSALove, + SynPerChatbotConvSASadness, + SynPerChatbotConvSASatisfaction, + SynPerChatbotConvSASurprise, +) + +task_list_cqa: list[AbsTask] = [ + SynPerChatbotConvSAAnger(), + SynPerChatbotConvSASatisfaction(), + SynPerChatbotConvSAFriendship(), + SynPerChatbotConvSAFear(), + SynPerChatbotConvSAJealousy(), + SynPerChatbotConvSASurprise(), + SynPerChatbotConvSALove(), + SynPerChatbotConvSASadness(), + SynPerChatbotConvSAHappiness(), +] + + +class SynPerChatbotConvSAClassification(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="SynPerChatbotConvSAClassification", + description="SynPerChatbotConvSAClassification", + 
reference="", + tasks=task_list_cqa, + main_score="accuracy", + type="Classification", + eval_splits=["test"], + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py new file mode 100644 index 0000000000..5333db7916 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .CQADupStackRetrieval import CQADupstackRetrieval +from .CQADupStackRetrievalFa import CQADupstackRetrievalFa +from .SynPerChatbotConvSAClassification import SynPerChatbotConvSAClassification + +__all__ = [ + "CQADupstackRetrieval", + "CQADupstackRetrievalFa", + "SynPerChatbotConvSAClassification", +] diff --git a/pyproject.toml b/pyproject.toml index 2c1040047c..43811556c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.14" +version = "1.31.8" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -58,7 +58,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index 36cfc572e9..84a81fca26 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -28,6 +28,7 @@ def get_changed_files(base_branch="main"): and f.endswith(".py") and "overview" not in f and "init" not in f + and "instructions" not in f ] diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 2b606c2c19..f7ac92a697 100644 --- a/tests/test_TaskMetadata.py +++ 
b/tests/test_TaskMetadata.py @@ -59,6 +59,7 @@ "AILAStatutes", "ArguAna", "ClimateFEVER", + "CQADupstackRetrieval", "CQADupstackAndroidRetrieval", "CQADupstackEnglishRetrieval", "CQADupstackGamingRetrieval", @@ -178,6 +179,8 @@ "TamilNewsClassification", "TenKGnadClusteringP2P.v2", "TenKGnadClusteringS2S.v2", + "SynPerChatbotConvSAClassification", + "CQADupstackRetrieval-Fa", ] diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..7041328a59 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -20,7 +20,7 @@ def test_get_tasks_size_differences(): ) -@pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) +@pytest.mark.parametrize("task_name", ["BornholmBitextMining", "CQADupstackRetrieval"]) @pytest.mark.parametrize("eval_splits", [["test"], None]) def test_get_task(task_name: str, eval_splits: list[str] | None): task = get_task(task_name, eval_splits=eval_splits) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 902c541390..7a87914f0a 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -13,6 +13,7 @@ from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader @@ -93,6 +94,8 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" tasks = MTEB().tasks_cls + # do not check aggregated tasks as they don't have a dataset + tasks = [t for t in tasks if not isinstance(t, 
AbsTaskAggregate)] tasks = [ t for t in tasks